Analysis of the performance of US-listed securities during COVID-19 Pandemic
Data Preparation
Extract user holding from multiple files
# Extract user holding from multiple files
folder_path = "C:/stock_popularity_history"
file_list <- list.files(folder_path, full.names = TRUE)
get_data <- function(file_path) {
file_name <- str_remove(basename(file_path), "\\..+$")
data <- read.csv(file_path)
data <- data %>% mutate(TICKER = file_name)
return(data)
}
data_list <- file_list %>% map_df(~ get_data(.))
user_holding <- bind_rows(data_list)
user_holding$timestamp <- as.POSIXct(user_holding$timestamp, format = "%Y-%m-%d %H:%M:%S")
user_holding$timestamp <- as.Date(user_holding$timestamp, format = "%Y-%m-%d")
user_holding <- user_holding %>% group_by(TICKER, timestamp) %>% mutate(users_holding = mean(users_holding)) %>% distinct(timestamp, TICKER, .keep_all = TRUE)
user_holding <- user_holding %>% filter(timestamp >= '2019-08-20')
write.csv(user_holding, file = "user_holding.csv", row.names = FALSE)Data import
Transform date format
# Make sure all the "date" are converted to ISO 8601 format
Stock_data_1$date <- as.Date(Stock_data_1$date, format = "%Y-%m-%d")
risk_free_rate$date <- as.Date(risk_free_rate$date, format = "%Y-%m-%d")
fama_french_factors$date <- as.Date(fama_french_factors$date, format = "%Y%m%d")
fama_french_factors$date <- format(fama_french_factors$date, format = "%Y-%m-%d")
fama_french_factors$date <- as.Date(fama_french_factors$date, format = "%Y-%m-%d")
userholding$timestamp <- as.Date(userholding$timestamp, format = "%Y-%m-%d")## Inspect dataset structure:
## 'data.frame': 1938801 obs. of 17 variables:
## $ PERMNO : int 10026 10026 10026 10026 10026 10026 10026 10026 10026 10026 ...
## $ date : Date, format: "2019-08-20" "2019-08-21" ...
## $ SHRCD : int 11 11 11 11 11 11 11 11 11 11 ...
## $ TICKER : chr "JJSF" "JJSF" "JJSF" "JJSF" ...
## $ COMNAM : chr "J & J SNACK FOODS CORP" "J & J SNACK FOODS CORP" "J & J SNACK FOODS CORP" "J & J SNACK FOODS CORP" ...
## $ PERMCO : int 7976 7976 7976 7976 7976 7976 7976 7976 7976 7976 ...
## $ BIDLO : num 191 189 188 186 187 ...
## $ ASKHI : num 197 193 190 190 191 ...
## $ PRC : num 191 189 189 186 191 ...
## $ VOL : int 136698 101583 92198 75522 81788 85299 67790 62266 113788 93747 ...
## $ RET : chr "-0.020298" "-0.009313" "-0.000158" "-0.016744" ...
## $ BID : num 191 189 189 186 191 ...
## $ ASK : num 191 189 190 186 191 ...
## $ SHROUT : int 18841 18841 18841 18841 18841 18841 18841 18841 18841 18841 ...
## $ OPENPRC: num 195 192 189 189 187 ...
## $ NUMTRD : num 1903 2252 1805 1629 2070 ...
## $ sprtrn : num -0.007915 0.008247 -0.000506 -0.025946 0.010983 ...
Data Wrangling
1. Handle missing values and duplicate data
# Clean data
dataset1 <- Stock_data_1 %>%
mutate(
PERMNO = as.character(PERMNO),
PRC = abs(PRC),
BIDLO = ifelse(BIDLO == 0.0, NA, BIDLO),
ASKHI = ifelse(ASKHI == 0.0, NA, ASKHI),
PRC = ifelse(PRC == 0.0, NA, PRC),
VOL = ifelse(VOL == -99, NA, VOL),
RET = case_when(
RET %in% c(-44, -55, -66, -77, -88, -99) ~ NA,
RET %in% c(".A", ".B", ".C", ".D") ~ NA,
TRUE ~ as.numeric(as.character(RET))
),
ASK = ifelse(ASK - BID < 0, NA, as.numeric(as.character(ASK))),
BID = ifelse(ASK - BID < 0, NA, as.numeric(as.character(BID))),
NUMTRD = ifelse(NUMTRD == 99, NA, NUMTRD)
) %>%
# Remove duplicate observations
distinct(PERMNO, date, .keep_all = TRUE)2. Remove low-observation securities
# Count observation for each unique security
total_count <- dataset1 %>%
filter(date < "2020-03-20") %>%
group_by(PERMNO) %>%
summarise(Count = n(), .groups = 'drop')
summary(total_count$Count)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 147.0 147.0 141.7 147.0 147.0
Despite the popular frequency is 147 observations, but it may reduce the sample size significantly if using 147 as the threshold, hence, I decided to use the lower bounds in COVID period as the threshold
# Define the threshold for low-observation (using the COVID period as the boundary)
obs_covid <- dataset1 %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
group_by(PERMNO) %>%
summarise(Count = n(), .groups = 'drop')
obs_covid_summary <- summary(obs_covid$Count)
summary(obs_covid$Count)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 25.00 25.00 24.78 25.00 25.00
# Calculate lower bounds for observation counts
# Formula: 25th percentile - {1.5 * (75th percentile - 25th percentile)}
obs_lower_bound_covid <- obs_covid_summary["1st Qu."] - 1.5 * (obs_covid_summary["3rd Qu."] - obs_covid_summary["1st Qu."])
# Remove low-observation securities
dataset1_obs_removed <- dataset1 %>%
filter((PERMNO %in% total_count$PERMNO[total_count$Count >= obs_lower_bound_covid])) %>%
ungroup()## Number of unique security (before remove low-obs): 8183
## Number of unique security (after remove low-obs): 7846
3. Assign TICKER to those have PERMNO in the entire period but missing TICKER in some dates
TICKER may be changed due to company name changes, mergers, or acquisition, but PERMNO is the most unique identifier of each security that won’t change just due to company movement.
# Check how many security is missing TICKER value while have PERMNO value
missing_ticker <- dataset1_obs_removed %>%
group_by(PERMNO) %>%
summarise(missing_ticker = any(TICKER == ""),
all_dates = n()) %>%
filter(missing_ticker) %>%
pull(PERMNO)
cat("Security that have PERMNO but missing TICKER (before cleaning): ", length(missing_ticker))## Security that have PERMNO but missing TICKER (before cleaning): 67
# Replace missing value TICKER if same PERMNO found
dataset1_cleaned <- dataset1_obs_removed %>%
group_by(PERMNO) %>%
mutate(TICKER = ifelse(TICKER == "",
first(na.omit(TICKER[TICKER != ""])),
TICKER)) %>%
ungroup()
# Check after the replace
missing_ticker_cleaned <- dataset1_cleaned %>%
group_by(PERMNO) %>%
summarise(missing_ticker = any(is.na(TICKER)),
all_dates = n()) %>%
filter(missing_ticker) %>%
pull(PERMNO)
cat("Security that have PERMNO but missing TICKER (after replacing): ", length(missing_ticker_cleaned))## Security that have PERMNO but missing TICKER (after replacing): 30
4. Add (Compute) variables for further analysis
4.1 Filter ETFs and Stocks
dataset1_filtered <- dataset1_cleaned %>%
filter(SHRCD == 73 | SHRCD == 11) %>%
filter(!is.na(RET) & !is.na(PRC))
cat("Number of unique security in the dataset: ",length(unique(dataset1_filtered$PERMNO)))## Number of unique security in the dataset: 5929
4.2 Compute dollar_vol, bid_ask_spread, turnover_ratio
4.3 Add risk-free rate
4.4 Compute Sharpe ratio and volatility
# Define the total trading days
trading_days_total <- length(unique(dataset1$date))
dataset1_df3 <- dataset1_df2 %>%
group_by(PERMNO) %>%
arrange(date) %>%
mutate(
# Compute daily mean return and standard deviation for each PERMNO
mean_ret = rollmean(RET, k = 30, fill = NA, align = "right", na.rm = TRUE),
std_ret = rollapply(RET, width = 30, FUN = sd, fill = NA, align = "right", na.rm = TRUE),
# Compute volatility
volatility = std_ret * sqrt(trading_days_total),
# Compute daily Sharpe Ratio
sharpe_ratio = ifelse(is.na(std_ret) | std_ret == 0, NA, (mean_ret - risk_free_rate) / std_ret)
) %>%
ungroup() %>%
select(-c("mean_ret", "std_ret")) 4.5 Add sector categories
# Add sector ID
dataset1_df4 <- dataset1_df3 %>%
left_join(industry_code, by = "TICKER") %>%
mutate(
SECTOR = case_when(
SECTOR == "Health Care" ~ "Health_Care",
SECTOR == "Information Technology" ~ "Information_Technology",
SECTOR == "Consumer Discretionary" ~ "Consumer_Discretionary",
SECTOR == "Consumer Staples" ~ "Consumer_Staples",
SECTOR == "Real Estate" ~ "Real_Estate",
SECTOR == "Communication Services" ~ "Communication_Services",
TRUE ~ SECTOR
),
# Assign all missing value as Unknown Sector to avoid dropping large amount of data
SECTOR = ifelse(is.na(SECTOR), "Unknown", SECTOR)
) %>%
select(-c(SECTOR_ID, INDUSTRY))4.7 Add market capitalisation and catogrise company size
dataset1_df6 <- dataset1_df5 %>%
filter(!is.na(SHROUT)) %>%
# Compute market capitalisation
mutate(
market_cap = SHROUT * PRC * 1000,
# Categorise company size for the security based on market capitalisation
company_size = case_when(
market_cap <= 2e9 ~ "Small",
market_cap > 2e9 & market_cap <= 1e10 ~ "Medium",
market_cap > 1e10 ~ "Large"
)
)
# Inspect the unique security count of each company size
print(dataset1_df6 %>%
group_by(company_size) %>%
summarise(distinct_tickers = n_distinct(PERMNO)))## # A tibble: 3 × 2
## company_size distinct_tickers
## <chr> <int>
## 1 Large 648
## 2 Medium 1331
## 3 Small 4843
4.8 Add absolute correlation between RET and sprtrn
4.9 Add dummy variables to identify ETFs or stocks
4.10 Tracking error (ETF)
To see how closely an investment portfolio’s returns align with the return of a benchmark index. The standard deviation of the difference between returns and benchmark returns.
Inspect the data
To make sure no PERMNO accidentally dropped during data wrangling
## Number of unique security before data wrangling: 5929
## Number of unique security after data wrangling: 5929
## No unique security dropped during data wrangling: TRUE
## Preview of cleaned dataset:
## # A tibble: 5 × 30
## # Groups: PERMNO [5]
## PERMNO date SHRCD TICKER COMNAM PERMCO BIDLO ASKHI PRC VOL
## <chr> <date> <int> <chr> <chr> <int> <dbl> <dbl> <dbl> <int>
## 1 10026 2019-08-20 11 JJSF J & J SNACK… 7976 191. 197. 191. 136698
## 2 10028 2019-08-20 11 DGSE D G S E COM… 7978 0.77 1.05 0.88 91081
## 3 10032 2019-08-20 11 PLXS PLEXUS CORP 7980 56.9 57.5 57.1 125933
## 4 10044 2019-08-20 11 RMCF ROCKY MOUNT… 7992 9.16 9.35 9.34 6558
## 5 10051 2019-08-20 11 HNGR HANGER INC 7999 18.9 19.3 18.9 206241
## # ℹ 20 more variables: RET <dbl>, BID <dbl>, ASK <dbl>, SHROUT <int>,
## # OPENPRC <dbl>, NUMTRD <dbl>, sprtrn <dbl>, dollar_vol <dbl>,
## # bid_ask_spread <dbl>, turnover_ratio <dbl>, risk_free_rate <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, SECTOR <chr>, market_share <dbl>,
## # market_cap <dbl>, company_size <chr>, abs_corr_sp <dbl>,
## # security_type <chr>, tracking_error <dbl>
## Summary of dataset:
## PERMNO date SHRCD TICKER
## Length:1445273 Min. :2019-08-20 Min. :11.00 Length:1445273
## Class :character 1st Qu.:2019-11-18 1st Qu.:11.00 Class :character
## Mode :character Median :2020-02-19 Median :11.00 Mode :character
## Mean :2020-02-18 Mean :34.06
## 3rd Qu.:2020-05-20 3rd Qu.:73.00
## Max. :2020-08-20 Max. :73.00
##
## COMNAM PERMCO BIDLO ASKHI
## Length:1445273 Min. : 5 Min. : 0.0 Min. : 0.0
## Class :character 1st Qu.:27739 1st Qu.: 10.9 1st Qu.: 11.5
## Mode :character Median :51451 Median : 25.5 Median : 25.9
## Mean :41071 Mean : 97.3 Mean : 99.4
## 3rd Qu.:54933 3rd Qu.: 49.4 3rd Qu.: 50.3
## Max. :57031 Max. :344550.0 Max. :347400.0
##
## PRC VOL RET BID
## Min. : 0.0 Min. : 0 Min. :-0.862893 Min. : 0.0
## 1st Qu.: 11.2 1st Qu.: 14900 1st Qu.:-0.011494 1st Qu.: 11.1
## Median : 25.7 Median : 138072 Median : 0.000372 Median : 25.6
## Mean : 98.3 Mean : 1266886 Mean : 0.000953 Mean : 98.3
## 3rd Qu.: 49.9 3rd Qu.: 682045 3rd Qu.: 0.011945 3rd Qu.: 49.9
## Max. :344970.0 Max. :990919210 Max. :10.251815 Max. :345140.0
## NA's :53
## ASK SHROUT OPENPRC NUMTRD
## Min. : 0.0 Min. : 9 Min. : 0.0 Min. : 0
## 1st Qu.: 11.2 1st Qu.: 4627 1st Qu.: 11.1 1st Qu.: 204
## Median : 25.7 Median : 23750 Median : 25.8 Median : 1294
## Mean : 98.5 Mean : 92374 Mean : 99.9 Mean : 6339
## 3rd Qu.: 50.0 3rd Qu.: 65639 3rd Qu.: 50.3 3rd Qu.: 4916
## Max. :345589.0 Max. :9308301 Max. :345000.0 Max. :1640000
## NA's :53 NA's :32427 NA's :793232
## sprtrn dollar_vol bid_ask_spread turnover_ratio
## Min. :-0.1198410 Min. :0.000e+00 Min. :0.00000 Min. : 0.0000
## 1st Qu.:-0.0040110 1st Qu.:2.145e+05 1st Qu.:0.00056 1st Qu.: 0.0025
## Median : 0.0018440 Median :2.234e+06 Median :0.00155 Median : 0.0058
## Mean : 0.0007867 Mean :6.393e+07 Mean :0.00592 Mean : 0.0230
## 3rd Qu.: 0.0076950 3rd Qu.:1.934e+07 3rd Qu.:0.00456 3rd Qu.: 0.0126
## Max. : 0.0938280 Max. :1.143e+11 Max. :1.98333 Max. :770.2883
## NA's :53
## risk_free_rate volatility sharpe_ratio SECTOR
## Min. :0.110 Min. : 0.00 Min. :-38267.36 Length:1445273
## 1st Qu.:0.170 1st Qu.: 0.18 1st Qu.: -101.84 Class :character
## Median :1.420 Median : 0.39 Median : -20.06 Mode :character
## Mean :0.909 Mean : 0.55 Mean : -114.36
## 3rd Qu.:1.530 3rd Qu.: 0.75 3rd Qu.: -4.18
## Max. :1.830 Max. :30.07 Max. : 0.10
## NA's :171888 NA's :171888
## market_share market_cap company_size abs_corr_sp
## Min. :0.0 Min. :2.970e+04 Length:1445273 Min. :0.0008779
## 1st Qu.:0.0 1st Qu.:5.880e+07 Class :character 1st Qu.:0.3884390
## Median :0.0 Median :3.302e+08 Mode :character Median :0.6248121
## Mean :0.0 Mean :5.992e+09 Mean :0.5898372
## 3rd Qu.:0.0 3rd Qu.:1.971e+09 3rd Qu.:0.8117317
## Max. :2.3 Max. :2.023e+12 Max. :1.0000000
## NA's :800157
## security_type tracking_error
## Length:1445273 Min. :0.0007778
## Class :character 1st Qu.:0.0158461
## Mode :character Median :0.0261979
## Mean :0.0348746
## 3rd Qu.:0.0441767
## Max. :0.6425179
##
5. Create function to save the plots
# To reduce the workload of manually saving every plot, create a function to save plot
save_plot <- function(plot, title) {
# Define the file path and name
file_name <- paste0(title, ".png")
file_path <- file.path(getwd(), file_name)
# Define the format
ggsave(filename = file_path,
plot = plot,
width = 12,
height = 6,
units = "in",
dpi = 600)
}1 Descriptive statistics & visualisation
# load library
library(e1071)
library(purrr)
library(ggplot2)
library(RColorBrewer)
library(plotly)
library(scales)Descriptive Statistics
Overall
Transform the dataset into a time series dataset:
# Group by date and summarise for each variable by aggregation
task1_overall <- dataset1_df9 %>%
group_by(date) %>%
summarise(
RET = mean(RET, na.rm = TRUE),
volatility = mean(volatility, na.rm = TRUE),
sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
dollar_vol = sum(dollar_vol, na.rm = TRUE),
number_of_trade = sum(NUMTRD, na.rm = TRUE),
bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
# Standardise variable to ensure they are in same scale
mutate(
scaled_RET = scale(RET),
scaled_sharpe_ratio = scale(sharpe_ratio),
scaled_volatility = scale(volatility),
scaled_spread = scale(bid_ask_spread),
scaled_dollar_vol = scale(dollar_vol),
scaled_numtrd = scale(number_of_trade),
scaled_turnover_ratio = scale(turnover_ratio)
)
head(task1_overall, n = 5)## # A tibble: 5 × 15
## date RET volatility sharpe_ratio dollar_vol number_of_trade
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019-08-20 -0.00390 NaN NaN 220253155332. 9213736
## 2 2019-08-21 0.00755 NaN NaN 218034206986. 9096519
## 3 2019-08-22 -0.00264 NaN NaN 223328802822. 9374476
## 4 2019-08-23 -0.0222 NaN NaN 359904889558. 12753999
## 5 2019-08-26 0.00829 NaN NaN 232556980779. 9642102
## # ℹ 9 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # scaled_RET <dbl[,1]>, scaled_sharpe_ratio <dbl[,1]>,
## # scaled_volatility <dbl[,1]>, scaled_spread <dbl[,1]>,
## # scaled_dollar_vol <dbl[,1]>, scaled_numtrd <dbl[,1]>,
## # scaled_turnover_ratio <dbl[,1]>
Create the descriptive statistics summary for all variables for both period:
# Non-COVID period
task1_overall_noncovid <- task1_overall %>%
filter(date >= "2019-10-14" & date <= "2019-11-20") %>%
# Exclude the first 29 days' observation to avoid the 30-day rolling window affect the summary statistics result
filter(date >= "2019-10-01") %>%
select(-date, -starts_with("scaled_")) %>%
summarise(across(everything(), list(
min = ~ min(.x, na.rm = TRUE),
q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}"))
# COVID period
task1_overall_covid <- task1_overall %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
# Exclude the first 29 days' observation to avoid the 30-day rolling window affect the summary statistics result
filter(date >= "2019-10-01") %>%
select(-date, -starts_with("scaled_")) %>%
summarise(across(everything(), list(
min = ~ min(.x, na.rm = TRUE),
q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}"))
print(task1_overall_noncovid)## # A tibble: 1 × 49
## RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd volatility_min
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.00508 -0.00207 0.00116 0.000432 0.00377 0.0139 0.00452 0.326
## # ℹ 41 more variables: volatility_q25 <dbl>, volatility_mean <dbl>,
## # volatility_median <dbl>, volatility_q75 <dbl>, volatility_max <dbl>,
## # volatility_sd <dbl>, sharpe_ratio_min <dbl>, sharpe_ratio_q25 <dbl>,
## # sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>, sharpe_ratio_q75 <dbl>,
## # sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>, dollar_vol_min <dbl>,
## # dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>, dollar_vol_median <dbl>,
## # dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, dollar_vol_sd <dbl>, …
## # A tibble: 1 × 49
## RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd volatility_min
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.106 -0.0282 -0.0151 -0.00916 0.00576 0.0567 0.0431 0.351
## # ℹ 41 more variables: volatility_q25 <dbl>, volatility_mean <dbl>,
## # volatility_median <dbl>, volatility_q75 <dbl>, volatility_max <dbl>,
## # volatility_sd <dbl>, sharpe_ratio_min <dbl>, sharpe_ratio_q25 <dbl>,
## # sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>, sharpe_ratio_q75 <dbl>,
## # sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>, dollar_vol_min <dbl>,
## # dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>, dollar_vol_median <dbl>,
## # dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, dollar_vol_sd <dbl>, …
By Industry Sector
Transform the dataset into a panel (Date & Industry)
# Group by date and SECTOR, summarise for each variable by aggregation
task1_sector <- dataset1_df9 %>%
group_by(SECTOR, date) %>%
summarise(
RET = mean(RET, na.rm = TRUE),
volatility = mean(volatility, na.rm = TRUE),
sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
dollar_vol = sum(dollar_vol, na.rm = TRUE),
number_of_trade = sum(NUMTRD, na.rm = TRUE),
bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
# Standardise variable to ensure they are in same scale
mutate(
scaled_RET = scale(RET),
scaled_sharpe_ratio = scale(sharpe_ratio),
scaled_volatility = scale(volatility),
scaled_spread = scale(bid_ask_spread),
scaled_dollar_vol = scale(dollar_vol),
scaled_numtrd = scale(number_of_trade),
scaled_turnover_ratio = scale(turnover_ratio)
)
head(task1_sector, n = 5)## # A tibble: 5 × 16
## SECTOR date RET volatility sharpe_ratio dollar_vol number_of_trade
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Commun… 2019-08-20 -0.00360 NaN NaN 1.19e10 601204
## 2 Commun… 2019-08-21 0.00552 NaN NaN 1.04e10 597822
## 3 Commun… 2019-08-22 -0.00377 NaN NaN 1.04e10 568559
## 4 Commun… 2019-08-23 -0.0238 NaN NaN 1.52e10 776119
## 5 Commun… 2019-08-26 0.0118 NaN NaN 1.20e10 620346
## # ℹ 9 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # scaled_RET <dbl[,1]>, scaled_sharpe_ratio <dbl[,1]>,
## # scaled_volatility <dbl[,1]>, scaled_spread <dbl[,1]>,
## # scaled_dollar_vol <dbl[,1]>, scaled_numtrd <dbl[,1]>,
## # scaled_turnover_ratio <dbl[,1]>
Create the descriptive statistics summary for all variables for both period
# Non-COVID period
task1_sector_noncovid <- task1_sector %>%
filter(date >= "2019-10-14" & date <= "2019-11-20") %>%
# Exclude the first 29 days' observation to avoid the 30-day rolling window affect the summary statistics result
filter(date >= "2019-10-01") %>%
select(-date, -starts_with("scaled_")) %>%
group_by(SECTOR) %>%
summarise(across(everything(), list(
min = ~ min(.x, na.rm = TRUE),
q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}")) %>%
arrange(SECTOR)
# COVID period
task1_sector_covid <- task1_sector %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
# Exclude the first 29 days' observation to avoid the 30-day rolling window affect the summary statistics result
filter(date >= "2019-10-01") %>%
select(-date, -starts_with("scaled_")) %>%
group_by(SECTOR) %>%
summarise(across(everything(), list(
min = ~ min(.x, na.rm = TRUE),
q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}")) %>%
arrange(SECTOR)
print(task1_sector_noncovid)## # A tibble: 12 × 50
## SECTOR RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Communication_… -0.0103 -0.00402 1.42e-3 0.00216 0.00703 0.0160 0.00732
## 2 Consumer_Discr… -0.0126 -0.00294 1.33e-3 0.00235 0.00547 0.0128 0.00608
## 3 Consumer_Stapl… -0.00737 -0.00114 3.92e-4 0.000184 0.00192 0.00966 0.00414
## 4 Energy -0.0306 -0.0121 -1.92e-3 -0.00265 0.00589 0.0435 0.0175
## 5 Financials -0.00650 -0.00141 1.93e-3 0.00161 0.00504 0.0122 0.00467
## 6 Health_Care -0.0140 -0.00291 2.97e-3 0.00202 0.00645 0.0255 0.00930
## 7 Industrials -0.00984 -0.00271 8.61e-4 0.0000116 0.00380 0.0198 0.00650
## 8 Information_Te… -0.0101 -0.00320 1.58e-3 0.00229 0.00701 0.0201 0.00710
## 9 Materials -0.0179 -0.00396 1.28e-4 -0.000200 0.00463 0.0261 0.00842
## 10 Real_Estate -0.0109 -0.00211 1.42e-3 0.00212 0.00401 0.0167 0.00596
## 11 Unknown -0.00403 -0.00146 9.18e-4 0.000280 0.00355 0.0111 0.00364
## 12 Utilities -0.0169 -0.00443 -7.81e-4 0.00158 0.00304 0.0112 0.00690
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## # volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## # volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## # sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## # sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## # dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## # dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
## # A tibble: 12 × 50
## SECTOR RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Communication_Ser… -0.119 -0.0347 -0.0199 -0.0225 6.19e-3 0.0998 0.0523
## 2 Consumer_Discreti… -0.156 -0.0395 -0.0242 -0.0198 6.93e-3 0.0908 0.0560
## 3 Consumer_Staples -0.0965 -0.0242 -0.0103 -0.0115 8.52e-4 0.0648 0.0408
## 4 Energy -0.241 -0.0494 -0.0259 -0.0157 6.57e-3 0.179 0.0808
## 5 Financials -0.121 -0.0365 -0.0171 -0.0217 4.18e-3 0.0858 0.0539
## 6 Health_Care -0.125 -0.0209 -0.0107 -0.00270 1.34e-2 0.0852 0.0487
## 7 Industrials -0.115 -0.0367 -0.0192 -0.0162 5.69e-3 0.0609 0.0490
## 8 Information_Techn… -0.120 -0.0320 -0.0167 -0.0187 1.44e-2 0.0689 0.0489
## 9 Materials -0.116 -0.0405 -0.0170 -0.00972 6.92e-3 0.0702 0.0509
## 10 Real_Estate -0.152 -0.0388 -0.0208 -0.0143 9.39e-3 0.106 0.0560
## 11 Unknown -0.0917 -0.0241 -0.0135 -0.00668 3.98e-3 0.0493 0.0368
## 12 Utilities -0.119 -0.0406 -0.0124 -0.0122 2.71e-3 0.138 0.0538
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## # volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## # volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## # sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## # sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## # dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## # dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
By Company size
Transform the dataset into a panel (Date & Company Size)
# Group by date and company_size, summarise for each variable by aggregation
task1_size <- dataset1_df9 %>%
group_by(company_size, date) %>%
summarise(
RET = mean(RET, na.rm = TRUE),
volatility = mean(volatility, na.rm = TRUE),
sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
dollar_vol = sum(dollar_vol, na.rm = TRUE),
number_of_trade = sum(NUMTRD, na.rm = TRUE),
bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
# Standardise variable to ensure they are in same scale
mutate(
scaled_RET = scale(RET),
scaled_sharpe_ratio = scale(sharpe_ratio),
scaled_volatility = scale(volatility),
scaled_spread = scale(bid_ask_spread),
scaled_dollar_vol = scale(dollar_vol),
scaled_numtrd = scale(number_of_trade),
scaled_turnover_ratio = scale(turnover_ratio)
) %>%
arrange(desc(company_size), date)
head(task1_size, n = 5)## # A tibble: 5 × 16
## company_size date RET volatility sharpe_ratio dollar_vol
## <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 Small 2019-08-20 -0.00316 NaN NaN 18029134547.
## 2 Small 2019-08-21 0.00723 NaN NaN 18198242694.
## 3 Small 2019-08-22 -0.00332 NaN NaN 19050070813.
## 4 Small 2019-08-23 -0.0212 NaN NaN 30848625441.
## 5 Small 2019-08-26 0.00824 NaN NaN 19840213213.
## # ℹ 10 more variables: number_of_trade <dbl>, bid_ask_spread <dbl>,
## # turnover_ratio <dbl>, scaled_RET <dbl[,1]>, scaled_sharpe_ratio <dbl[,1]>,
## # scaled_volatility <dbl[,1]>, scaled_spread <dbl[,1]>,
## # scaled_dollar_vol <dbl[,1]>, scaled_numtrd <dbl[,1]>,
## # scaled_turnover_ratio <dbl[,1]>
Create the descriptive statistics summary for all variables for both period
# Non-COVID period
task1_size_noncovid <- task1_size %>%
filter(date >= "2019-10-14" & date <= "2019-11-20") %>%
# Exclude the first 29 days' observation to avoid the 30-day rolling window affect the summary statistics result
select(-date, -starts_with("scaled_")) %>%
group_by(company_size) %>%
summarise(across(everything(), list(
min = ~ min(.x, na.rm = TRUE),
q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}")) %>%
arrange(desc(company_size))
# COVID period
task1_size_covid <- task1_size %>%
# Exclude the first 29 days' observation to avoid the 30-day rolling window affect the summary statistics result
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
select(-date, -starts_with("scaled_")) %>%
group_by(company_size) %>%
summarise(across(everything(), list(
min = ~ min(.x, na.rm = TRUE),
q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}")) %>%
arrange(desc(company_size))
print(task1_size_noncovid)## # A tibble: 3 × 50
## company_size RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Small -0.00574 -0.00267 0.000853 0.000216 0.00326 0.0141 0.00476
## 2 Medium -0.00611 -0.00146 0.00221 0.00152 0.00567 0.0149 0.00479
## 3 Large -0.00495 -0.000924 0.00186 0.00163 0.00369 0.0109 0.00361
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## # volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## # volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## # sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## # sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## # dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## # dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
## # A tibble: 3 × 50
## company_size RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Small -0.103 -0.0267 -0.0151 -0.00893 0.00569 0.0573 0.0424
## 2 Medium -0.120 -0.0324 -0.0161 -0.0113 0.00661 0.0739 0.0474
## 3 Large -0.110 -0.0350 -0.0138 -0.0101 0.00483 0.0776 0.0447
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## # volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## # volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## # sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## # sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## # dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## # dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
By Security Types
Transform the dataset into a panel (Date & Security Type)
# Group by date and security_type, summarise for each variable by aggregation
task1_type <- dataset1_df9 %>%
group_by(security_type, date) %>%
summarise(
RET = mean(RET, na.rm = TRUE),
volatility = mean(volatility, na.rm = TRUE),
sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
dollar_vol = sum(dollar_vol, na.rm = TRUE),
number_of_trade = sum(NUMTRD, na.rm = TRUE),
bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
.groups = "drop"
) %>%
# Standardise variable to ensure they are in same scale
mutate(
scaled_RET = scale(RET),
scaled_sharpe_ratio = scale(sharpe_ratio),
scaled_volatility = scale(volatility),
scaled_spread = scale(bid_ask_spread),
scaled_dollar_vol = scale(dollar_vol),
scaled_numtrd = scale(number_of_trade),
scaled_turnover_ratio = scale(turnover_ratio)
) %>% arrange(desc(security_type), date)
head(task1_type, n = 5)## # A tibble: 5 × 16
## security_type date RET volatility sharpe_ratio dollar_vol
## <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 Stock 2019-08-20 -0.00465 NaN NaN 155680282032.
## 2 Stock 2019-08-21 0.00875 NaN NaN 152342708836.
## 3 Stock 2019-08-22 -0.00303 NaN NaN 154016031104.
## 4 Stock 2019-08-23 -0.0268 NaN NaN 215699996631.
## 5 Stock 2019-08-26 0.0101 NaN NaN 150830388226.
## # ℹ 10 more variables: number_of_trade <dbl>, bid_ask_spread <dbl>,
## # turnover_ratio <dbl>, scaled_RET <dbl[,1]>, scaled_sharpe_ratio <dbl[,1]>,
## # scaled_volatility <dbl[,1]>, scaled_spread <dbl[,1]>,
## # scaled_dollar_vol <dbl[,1]>, scaled_numtrd <dbl[,1]>,
## # scaled_turnover_ratio <dbl[,1]>
Create the descriptive statistics summary for all variables for both period
# Non-COVID period
task1_type_noncovid <- task1_type %>%
filter(date >= "2019-10-14" & date <= "2019-11-20") %>%
# Exclude the first 29 days' observation to avoid the 30-day rolling window affect the summary statistics result
select(-date, -starts_with("scaled_")) %>%
group_by(security_type) %>%
summarise(across(everything(), list(
min = ~ min(.x, na.rm = TRUE),
q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}"))
# COVID period
task1_type_covid <- task1_type %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
# Exclude the first 29 days' observation to avoid the 30-day rolling window affect the summary statistics result
select(-date, -starts_with("scaled_")) %>%
group_by(security_type) %>%
summarise(across(everything(), list(
min = ~ min(.x, na.rm = TRUE),
q25 = ~ quantile(.x, 0.25, na.rm = TRUE),
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE),
q75 = ~ quantile(.x, 0.75, na.rm = TRUE),
max = ~ max(.x, na.rm = TRUE),
sd = ~ sd(.x, na.rm = TRUE)
),
.names = "{.col}_{.fn}"))
print(task1_type_noncovid)## # A tibble: 2 × 50
## security_type RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETF -0.00298 -0.000675 0.000909 0.0000341 0.00273 0.00846 0.00269
## 2 Stock -0.00738 -0.00292 0.00131 0.000338 0.00381 0.0171 0.00573
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## # volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## # volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## # sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## # sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## # dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## # dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
## # A tibble: 2 × 50
## security_type RET_min RET_q25 RET_mean RET_median RET_q75 RET_max RET_sd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETF -0.0806 -0.0228 -0.0122 -0.00954 0.00387 0.0465 0.0321
## 2 Stock -0.121 -0.0319 -0.0168 -0.0124 0.00688 0.0749 0.0501
## # ℹ 42 more variables: volatility_min <dbl>, volatility_q25 <dbl>,
## # volatility_mean <dbl>, volatility_median <dbl>, volatility_q75 <dbl>,
## # volatility_max <dbl>, volatility_sd <dbl>, sharpe_ratio_min <dbl>,
## # sharpe_ratio_q25 <dbl>, sharpe_ratio_mean <dbl>, sharpe_ratio_median <dbl>,
## # sharpe_ratio_q75 <dbl>, sharpe_ratio_max <dbl>, sharpe_ratio_sd <dbl>,
## # dollar_vol_min <dbl>, dollar_vol_q25 <dbl>, dollar_vol_mean <dbl>,
## # dollar_vol_median <dbl>, dollar_vol_q75 <dbl>, dollar_vol_max <dbl>, …
Save the descriptive summary in csv files (For report)
# Overall
write.csv(task1_overall_noncovid, file = "task1_overall_noncovid.csv", row.names = FALSE)
write.csv(task1_overall_covid, file = "task1_overall_covid.csv", row.names = FALSE)
# By Sector
write.csv(task1_sector_noncovid, file = "task1_sector_noncovid.csv", row.names = FALSE)
write.csv(task1_sector_covid, file = "task1_sector_covid.csv", row.names = FALSE)
# By Company size
write.csv(task1_size_noncovid, file = "task1_size_noncovid.csv", row.names = FALSE)
write.csv(task1_size_covid, file = "task1_size_covid.csv", row.names = FALSE)
# By security type
write.csv(task1_type_noncovid, file = "task1_type_noncovid.csv", row.names = FALSE)
write.csv(task1_type_covid, file = "task1_type_covid.csv", row.names = FALSE)Time series plot
Function to create time series plot (Overall)
# Dataset for this plot is time series
time_series_plot_overall <- function(data, variables, var_names, periods) {
# Convert the dataset to long format
data_long <- data %>%
pivot_longer(cols = all_of(variables),
names_to = "variable",
values_to = "value") %>%
mutate(variable = factor(variable,
levels = variables,
labels = var_names)
)
# Create shaded areas for showing comparison periods
shaded_areas <- bind_rows(
lapply(periods, function(p) {
data.frame(xmin = as.Date(p$start),
xmax = as.Date(p$end),
ymin = -Inf,
ymax = Inf,
fill = p$color,
label = p$label)
})
)
# Create plots for each variable
create_plot <- function(var_name, dataset_name) {
plot_data <- data_long %>%
filter(variable == var_name)
p <- ggplot(plot_data, aes(x = date,
y = value)) +
geom_rect(data = shaded_areas,
aes(xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
fill = label),
alpha = 0.2,
inherit.aes = FALSE) +
geom_line(aes(color = "Data"),
size = 1) +
scale_fill_manual(values = setNames(unique(shaded_areas$fill),
unique(shaded_areas$label)),
name = "Period") +
guides(fill = guide_legend(title = "Periods for Comparison"),
color = guide_legend(title = "Metrics")) +
labs(title = paste("Overall Performance of", var_name, "during 20 August 2019 to 20 August 2020"),
x = "Date",
y = paste("Standardised", var_name)) +
scale_x_date(date_labels = "%Y-%m-%d", date_breaks = "1 month") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 55, hjust = 1)) +
scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) +
scale_color_manual(name = "Metrics",
values = c("Data" = "red"),
labels = c("Data" = var_name))
# Save plot for report
save_plot(p, paste0(dataset_name, "_", var_name))
return(p)
}
# To keep the saved plot file name easy to read
dataset_name <- deparse(substitute(data))
# Create a list of plots
plots <- lapply(var_names, function(var_name) {
create_plot(var_name, dataset_name)
})
# Print plots
for (plot in plots) {
print(plot)
}
}Function to create time series plot (By group)
time_series_plot_group <- function(data, variables, var_names, label_name, periods, group, color_map) {
# Convert the data to long format
data_long <- data %>%
pivot_longer(cols = all_of(variables),
names_to = "variable",
values_to = "value") %>%
mutate(variable = factor(variable,
levels = variables,
labels = var_names)) %>%
filter(!is.na(!!sym(group)))
# Define color palette for the group variable
group_colors <- brewer.pal(n = length(unique(data_long[[group]])),
name = color_map)
names(group_colors) <- unique(data_long[[group]])
# Create shaded areas for showing comparison periods
shaded_areas <- do.call(rbind,
lapply(periods, function(p) {
data.frame(xmin = as.Date(p$start),
xmax = as.Date(p$end),
ymin = -Inf,
ymax = Inf,
fill = p$color,
label = p$label)
})
)
# Create plots for each variable
create_plot <- function(var_name, dataset_name) {
plot_data <- data_long %>%
filter(variable == var_name)
# Initialise the plot
p <- ggplot() +
# Loop over each group and plot its points and lines in order
lapply(unique(plot_data[[group]]), function(g) {
list(
geom_point(data = plot_data %>% filter(!!sym(group) == g),
aes(x = date, y = value, color = !!sym(group)), size = 2, alpha = 0.9),
geom_line(data = plot_data %>% filter(!!sym(group) == g),
aes(x = date, y = value, color = !!sym(group)), size = 1)
)
}) +
# Create shaded comparison periods
geom_rect(data = shaded_areas,
aes(xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
fill = label),
alpha = 0.2,
inherit.aes = FALSE) +
# Customise colors and legends
scale_color_manual(values = group_colors,
name = label_name) +
scale_fill_manual(values = setNames(unique(shaded_areas$fill),
unique(shaded_areas$label)),
name = "Period") +
# Customise labels and titles
labs(title = paste("Overall Performance of", var_name, "during 20 August 2019 to 20 August 2020"),
subtitle = paste("( Group by", label_name, ")"),
x = "Date",
y = paste("Standardised", var_name)) +
# Customise the x-axis and y-axis
scale_x_date(date_labels = "%Y-%m-%d",
date_breaks = "1 month") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 55,
hjust = 1)) +
scale_y_continuous(expand = expansion(mult = c(0.05, 0.1)))
# Save the plot
save_plot(p, paste0(dataset_name, "_", var_name))
return(p)
}
dataset_name <- deparse(substitute(data))
plots <- lapply(var_names, function(var_name) {
create_plot(var_name, dataset_name)
})
# Print plots
for (plot in plots) {
print(plot)
}
}Call function to create plot(s)
# Define the variables and their corresponding names
variables_scaled <- c('scaled_RET', 'scaled_volatility', 'scaled_sharpe_ratio', 'scaled_spread', 'scaled_dollar_vol', 'scaled_numtrd', 'scaled_turnover_ratio')
var_names <- c('Returns', 'Volatility', 'Sharpe Ratio', 'Bid-Ask Spread', 'Dollar Volume', 'Number of Trade', 'Turnover Ratio')
# Define the dates and colors for shaded areas
periods_group <- list(
list(start = "2019-10-14", end = "2019-11-20", color = "#937613", label = "Non-COVID Period\n(2019-10-14 to 2019-11-20)"),
list(start = "2020-02-14", end = "2020-03-20", color = "black", label = "COVID Period\n(2020-02-14 to 2020-03-20)")
)
# Overall Performance
time_series_plot_overall(task1_overall, variables_scaled, var_names, periods_group)# By security type
time_series_plot_group(task1_type, variables_scaled, var_names, "Security Types", periods_group, 'security_type', color_map = "Dark2")# By industry sector
time_series_plot_group(task1_sector, variables_scaled, var_names,"Industry Sectors", periods_group, 'SECTOR', color_map = "Paired")# By Company size
time_series_plot_group(task1_size, variables_scaled, var_names, "Company sizes", periods_group, 'company_size', color_map = "Set1")Function to create time series plot (Market Performance and Risk Metrics)
market_risk <- function(data, variables, var_names, periods) {
# Convert data to long format
data_long <- data %>%
pivot_longer(cols = all_of(variables),
names_to = "variable",
values_to = "value") %>%
mutate(variable = factor(variable,
levels = variables,
labels = var_names)
)
# Create shaded areas data frame
shaded_areas <- do.call(rbind,
lapply(periods, function(p) {
data.frame(xmin = as.Date(p$start),
xmax = as.Date(p$end),
ymin = -Inf,
ymax = Inf,
fill = p$color,
label = p$label)
})
)
# Define color mapping for each variable
color_mapping <- c(
"Returns" = "#f39906",
"Volatility" = "#6402bb",
"Sharpe Ratio" = "#32cd4c"
)
# Define line type mapping
linetype_mapping <- c(
"Returns" = "solid",
"Volatility" = "solid",
"Sharpe Ratio" = "solid"
)
# Create the plot
ggplot() +
# Plot primary variables with unique colors and line types
geom_line(data = data_long,
aes(x = date,
y = value,
color = variable,
linetype = variable)
) +
# Add shaded areas
geom_rect(data = shaded_areas,
aes(xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
fill = label),
alpha = 0.2,
inherit.aes = FALSE) +
# Define color and line type scales
scale_color_manual(name = "Metrics",
values = color_mapping) +
scale_linetype_manual(name = "Metrics",
values = linetype_mapping) +
# Define fill scales for shaded areas
scale_fill_manual(values = setNames(unique(shaded_areas$fill),
unique(shaded_areas$label)),
name = "Period") +
# Define axis labels and limits
labs(title = "Market Performance and Risk Metrics: August 20, 2019 to August 20, 2020",
x = "Date",
y = "Standardised Values of Performance Metrics") +
scale_x_date(date_labels = "%Y-%m-%d", date_breaks = "1 month") +
scale_y_continuous(limits = c(-5.5, 5.5), breaks = seq(-5.0, 5.0, by = 2)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 55, hjust = 1))
}
# Define variables
variables_risk <- c('scaled_RET', 'scaled_volatility', 'scaled_sharpe_ratio')
var_names_risk <- c('Returns', 'Volatility', 'Sharpe Ratio')
# Call function to create plot
performance_risk <- market_risk(task1_overall, variables_risk, var_names_risk, periods_group)
print(performance_risk)Function to create time series plot (Market liquidity and trading behavior Metrics)
market_liquidity <- function(data, variables, var_names, periods) {
# Convert data to long format
data_long <- data %>%
pivot_longer(cols = all_of(variables),
names_to = "variable",
values_to = "value") %>%
mutate(variable = factor(variable,
levels = variables,
labels = var_names)
)
# Create shaded areas data frame
shaded_areas <- do.call(rbind,
lapply(periods, function(p) {
data.frame(xmin = as.Date(p$start),
xmax = as.Date(p$end),
ymin = -Inf,
ymax = Inf,
fill = p$color,
label = p$label)
})
)
# Define color mapping for each variable
color_mapping <- c(
"Bid-Ask Spread" = "#f39906",
"Dollar Volume" = "#6402bb",
"Number of Trade" = "#e85697",
"Turnover Ratio" = "#32cd4c",
"Volatility" = "black"
)
# Define line type mapping (solid for all except Max Drawdown)
linetype_mapping <- c(
"Bid-Ask Spread" = "solid",
"Dollar Volume" = "solid",
"Turnover Ratio" = "solid",
"Number of Trade" = "solid",
"Volatility" = "dashed"
)
# Create the plot
ggplot() +
# Plot primary variables with unique colors and line types
geom_line(data = data_long,
aes(x = date,
y = value,
color = variable,
linetype = variable)
) +
# Add shaded areas
geom_rect(data = shaded_areas,
aes(xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax,
fill = label),
alpha = 0.2,
inherit.aes = FALSE) +
# Define color and line type scales
scale_color_manual(name = "Metrics",
values = color_mapping) +
scale_linetype_manual(name = "Metrics",
values = linetype_mapping) +
# Define fill scales for shaded areas
scale_fill_manual(values = setNames(unique(shaded_areas$fill),
unique(shaded_areas$label)),
name = "Period") +
# Define axis labels and limits
labs(title = "Market Liquidity and Trading Behavior Metrics: August 20, 2019 to August 20, 2020",
x = "Date",
y = "Standardised Values of Performance Metrics") +
scale_x_date(date_labels = "%Y-%m-%d", date_breaks = "1 month") +
scale_y_continuous(limits = c(-3, 6), breaks = seq(-3, 6, by = 2)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 55, hjust = 1))
}
# Define variables
variables_liquidity <- c('scaled_volatility', 'scaled_spread', 'scaled_dollar_vol', 'scaled_numtrd', 'scaled_turnover_ratio')
var_names_liquidity <- c('Volatility', 'Bid-Ask Spread', 'Dollar Volume', 'Number of Trade', 'Turnover Ratio')
# Call function to create plot
liquidity_trading <- market_liquidity(task1_overall, variables_liquidity, var_names_liquidity, periods_group)
print(liquidity_trading)2 OLS Regressions
Data Preparation
Calculate Fama-French Factor for Non-COVID period
task2 <- dataset1_df9
# Prepare dataset
noncovid_filtered <- task2 %>%
filter(date < "2019-12-14") %>%
select(PERMNO, date, RET) %>%
left_join(fama_french_factors, by = "date")
# Initialise an empty list to store beta values
beta_list <- list()
# Loop through each unique PERMNO
for (permno in unique(noncovid_filtered$PERMNO)) {
# Filter the data for the current PERMNO
FFF_beta_noncovid <- subset(noncovid_filtered, PERMNO == permno)
# Run the regression to calculate betas
model_beta <- lm(RET ~ Mkt.RF + SMB + HML, data = FFF_beta_noncovid)
# Store the betas with the PERMNO
beta_list[[permno]] <- coef(model_beta)
}
# Convert the list of betas to a data frame
# Convert list to data frame with PERMNO names as row names
FFF_beta_noncovid <- do.call(rbind, lapply(names(beta_list), function(x) {
as.data.frame(t(beta_list[[x]]), row.names = x)
}))
# Reset row names to a column
FFF_beta_noncovid <- tibble::rownames_to_column(FFF_beta_noncovid, var = "PERMNO")
# Rename columns to reflect beta coefficients
FFF_beta_noncovid <- FFF_beta_noncovid %>%
rename(
Mkt_beta = Mkt.RF,
SMB_beta = SMB,
HML_beta = HML
) %>%
select(-"(Intercept)")
FFF_beta_noncovid$PERMNO <- as.character(FFF_beta_noncovid$PERMNO)
print(head(FFF_beta_noncovid, n = 5))## PERMNO Mkt_beta SMB_beta HML_beta
## 1 10026 0.0036246336 0.0018566596 -0.0024391910
## 2 10028 0.0187801943 0.0034850211 0.0017528928
## 3 10032 0.0151359631 0.0055842491 -0.0020193657
## 4 10044 0.0002896842 -0.0006347077 -0.0019975343
## 5 10051 0.0065680513 0.0135855432 0.0009297792
Calculate Fama-French Factor for COVID period
# Prepare dataset
covid_filtered <- task2 %>%
filter(date < "2020-02-14") %>%
select(PERMNO, date, RET) %>%
left_join(fama_french_factors, by = "date")
# Initialise an empty list to store beta values
beta_list <- list()
# Loop through each unique PERMNO
for (permno in unique(covid_filtered$PERMNO)) {
# Filter the data for the current PERMNO
FFF_beta_covid <- subset(covid_filtered, PERMNO == permno)
# Run the regression to calculate betas
model_beta <- lm(RET ~ Mkt.RF + SMB + HML, data = FFF_beta_covid)
# Store the betas with the PERMNO
beta_list[[permno]] <- coef(model_beta)
}
# Convert the list of betas to a data frame
# Convert list to data frame with PERMNO names as row names
FFF_beta_covid <- do.call(rbind, lapply(names(beta_list), function(x) {
as.data.frame(t(beta_list[[x]]), row.names = x)
}))
# Reset row names to a column
FFF_beta_covid <- tibble::rownames_to_column(FFF_beta_covid, var = "PERMNO")
# Rename columns to reflect beta coefficients
FFF_beta_covid <- FFF_beta_covid %>%
rename(
Mkt_beta = Mkt.RF,
SMB_beta = SMB,
HML_beta = HML
) %>%
select(-"(Intercept)")
FFF_beta_covid$PERMNO <- as.character(FFF_beta_covid$PERMNO)
print(head(FFF_beta_covid, n = 5))## PERMNO Mkt_beta SMB_beta HML_beta
## 1 10026 0.0016996314 0.001972067 -0.0005208422
## 2 10028 0.0142608706 -0.002256404 0.0002025232
## 3 10032 0.0141317851 0.005488963 0.0004350972
## 4 10044 0.0003835403 0.001919091 -0.0025091552
## 5 10051 0.0039712741 0.012994810 0.0029740526
Transform cross-sectional dataset function
transform_dataset_for_task2 <- function(data, start_date_RET, end_date_RET, start_date_others, end_date_others, type_of_security, FFF_dataset) {
epsilon <- 1e-6
# Find the date with the most PERMNOs having the highest price
highest_change <- data %>%
filter(date >= as.Date(start_date_RET) & date <= as.Date(end_date_RET)) %>%
group_by(PERMNO) %>%
filter(PRC == max(PRC)) %>%
ungroup() %>%
group_by(date) %>%
summarise(num_highest = n()) %>%
arrange(desc(num_highest))
highest_date <- highest_change$date[1]
cat("Highest Date: ")
print(highest_change$date[1])
# Find the date with the most PERMNOs having the lowest price
lowest_change <- data %>%
filter(date >= as.Date(start_date_RET) & date <= as.Date(end_date_RET)) %>%
group_by(PERMNO) %>%
filter(PRC == min(PRC)) %>%
ungroup() %>%
group_by(date) %>%
summarise(num_lowest = n()) %>%
arrange(desc(num_lowest))
lowest_date <- lowest_change$date[1]
cat("\nLowest Date: ")
print(lowest_change$date[1])
# Calculate the return using the lowest_date's and highest_date's price
data_RET <- data %>%
filter(date %in% c(highest_date, lowest_date)) %>%
select(PERMNO, date, PRC) %>%
spread(key = date, value = PRC, fill = NA) %>%
rename(price_highest = !!as.character(highest_date),
price_lowest = !!as.character(lowest_date)) %>%
mutate(returns = (price_highest - price_lowest) / price_lowest) %>%
select(PERMNO, returns)
cat("\nNumber of PERMNO in the dataset that calculated returns: ", length(unique(data_RET$PERMNO)), "\n")
# Filter the data for two periods
data_filtered <- data %>%
filter(date >= as.Date(start_date_others) & date <= as.Date(end_date_others)) %>%
arrange(PERMNO, date)
# Filter securities type and calculate aggregated variables
data_others <- data_filtered %>%
filter(
type_of_security == "both" |
(type_of_security == "ETFs" & SHRCD == 73) |
(type_of_security == "stocks" & SHRCD == 11)
) %>%
group_by(PERMNO) %>%
summarise(
SHRCD = first(SHRCD),
TICKER = first(TICKER),
SECTOR = first(SECTOR),
company_size = first(company_size),
VOL = sum(VOL, na.rm = TRUE),
dollar_vol = sum(dollar_vol, na.rm = TRUE),
bid_ask_spread = mean(bid_ask_spread, na.rm = TRUE),
turnover_ratio = mean(turnover_ratio, na.rm = TRUE),
volatility = mean(volatility, na.rm = TRUE),
sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
market_share = sum(market_share, na.rm = TRUE),
market_cap = sum(market_cap, na.rm = TRUE),
abs_corr_sp = mean(abs_corr_sp, na.rm = TRUE),
security_type = first(security_type),
tracking_error = mean(tracking_error, na.rm = TRUE),
log_vol = log(sum(VOL + epsilon, na.rm = TRUE)),
log_dollar_vol= log(sum(dollar_vol + epsilon, na.rm = TRUE)),
log_spread = log(mean(bid_ask_spread + epsilon, na.rm = TRUE)),
log_turnover_ratio = log(mean(turnover_ratio + epsilon, na.rm = TRUE)),
log_volatility = log(mean(volatility + epsilon, na.rm = TRUE)),
log_market_share = log(sum(market_share + epsilon, na.rm = TRUE)),
log_market_cap = log(sum(market_cap + epsilon, na.rm = TRUE)),
log_tracking_error = log(mean(tracking_error + epsilon, na.rm = TRUE)),
vol_cap_ratio = VOL / market_cap,
log_vol_cap_ratio = log(vol_cap_ratio),
.groups = "drop"
) %>%
mutate(
scaled_vol = scale(VOL),
scaled_dollar_vol = scale(dollar_vol),
scaled_spread = scale(bid_ask_spread),
scaled_turnover_ratio = scale(turnover_ratio),
scaled_volatility = scale(volatility),
scaled_sharpe_ratio = scale(sharpe_ratio),
scaled_market_share = scale(market_share),
scaled_market_cap = scale(market_cap),
scaled_abs_corr_sp = scale(abs_corr_sp),
scaled_tracking_error = scale(tracking_error)
)
# Check for duplicates
duplicates <- data_others %>%
group_by(PERMNO) %>%
filter(n() > 1) %>%
summarise(count = n())
# Debugging
if(nrow(duplicates) > 0) {
warning("There are duplicates: ", paste(duplicates$PERMNO, collapse = ", "))
}
cat("Number of PERMNO that are in the dataset that aggregate x variables: ", length(unique(data_others$PERMNO)), "\n")
cat("Number of PERMNO that are in the both datasets: ", length(unique(data_RET$PERMNO[data_RET$PERMNO %in% data_others$PERMNO])), "\n")
# Join the returns values with other variables
final_data <- data_RET %>%
filter(PERMNO %in% data_others$PERMNO) %>%
left_join(data_others, by = "PERMNO") %>%
ungroup() %>%
left_join(FFF_dataset, by = "PERMNO") %>%
filter(returns != 0)
cat("Number of PERMNO that are in the final dataset: ", length(unique(final_data$PERMNO)), "\n")
return(final_data)
}Function to remove outlier
#Function to remove outlier
remove_outliers_all <- function(data, variables) {
# Initialise a logical vector to keep track of rows to retain
keep_rows <- rep(TRUE, nrow(data))
# Loop over each variable
for (variable in variables) {
# Compute outlier bounds
bounds <- data %>%
summarise(
Q1 = quantile(.data[[variable]], 0.25, na.rm = TRUE),
Q3 = quantile(.data[[variable]], 0.75, na.rm = TRUE),
IQR = Q3 - Q1,
lower_bound = Q1 - 1.5 * IQR,
upper_bound = Q3 + 1.5 * IQR
)
# Apply the filtering based on computed bounds
keep_rows <- keep_rows & (data[[variable]] >= bounds$lower_bound & data[[variable]] <= bounds$upper_bound)
}
# Return the cleaned data
return(data[keep_rows, ])
}Model 1: Returns: 2019-12-14 to 2020-01-20 & X variables: before 2019-02-14
# Period 1 - Dataset 1
task2_p1_df1_1 <- transform_dataset_for_task2(task2, "2019-12-14", "2020-01-20", "2019-08-20", "2019-12-14", "both", FFF_beta_noncovid) ## Highest Date: [1] "2020-01-17"
##
## Lowest Date: [1] "2019-12-16"
##
## Number of PERMNO in the dataset that calculated returns: 5790
## Number of PERMNO that are in the dataset that aggregate x variables: 5840
## Number of PERMNO that are in the both datasets: 5746
## Number of PERMNO that are in the final dataset: 5682
# Remove outlier for returns
task2_p1_df1_1 <- remove_outliers_all(task2_p1_df1_1, "returns")
# Inspect the distribution of Returns
print(
ggplot(task2_p1_df1_1, aes(x = returns)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
theme_minimal() +
labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
)## Sample size after remove outlier: 4919
# Create dummy variable for categorical variable
task2_reg_p1_df1 <- task2_p1_df1_1 %>%
filter(!is.na(volatility)) %>%
mutate(company_size = factor(company_size, levels = unique(task2_p1_df1_1$company_size)),
security_type = factor(security_type, levels = unique(task2_p1_df1_1$security_type)),
SECTOR = factor(SECTOR, levels = unique(task2_p1_df1_1$SECTOR))) %>%
bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
bind_cols(as_tibble(model.matrix(~ security_type - 1, data = .))) %>%
bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) ## tibble [4,881 × 57] (S3: tbl_df/tbl/data.frame)
## $ PERMNO : chr [1:4881] "10026" "10028" "10032" "10044" ...
## $ returns : num [1:4881] 0.01925 0.1259 0.01087 -0.00613 -0.00417 ...
## $ SHRCD : int [1:4881] 11 11 11 11 11 11 11 73 11 11 ...
## $ TICKER : chr [1:4881] "JJSF" "DGSE" "PLXS" "RMCF" ...
## $ SECTOR : Factor w/ 12 levels "Consumer_Staples",..: 1 2 3 1 2 3 3 2 4 5 ...
## $ company_size : Factor w/ 3 levels "Medium","Small",..: 1 2 2 2 2 3 3 2 3 3 ...
## $ VOL : num [1:4881] 7586162 4650390 13170524 658814 32686198 ...
## $ dollar_vol : num [1:4881] 1.44e+09 6.27e+06 9.09e+08 5.93e+06 7.24e+08 ...
## $ bid_ask_spread : num [1:4881] 0.00109 0.015923 0.000649 0.011499 0.000773 ...
## $ turnover_ratio : num [1:4881] 0.0049 0.00211 0.00551 0.00134 0.01069 ...
## $ volatility : num [1:4881] 0.176 0.658 0.28 0.153 0.364 ...
## $ sharpe_ratio : num [1:4881] -143.4 -38.2 -91 -161.8 -68.9 ...
## $ market_share : num [1:4881] 0.0286 0 0.0252 0.2334 0.0562 ...
## $ market_cap : num [1:4881] 2.94e+11 2.76e+09 1.62e+11 4.45e+09 6.75e+10 ...
## $ abs_corr_sp : num [1:4881] 0.72 0.318 0.635 0.193 0.595 ...
## $ security_type : Factor w/ 2 levels "Stock","ETF": 1 1 1 1 1 1 1 2 1 1 ...
## $ tracking_error : num [1:4881] 0.0225 0.0523 0.0261 0.0403 0.0324 ...
## $ log_vol : num [1:4881] 15.8 15.4 16.4 13.4 17.3 ...
## $ log_dollar_vol : num [1:4881] 21.1 15.7 20.6 15.6 20.4 ...
## $ log_spread : num [1:4881] -6.82 -4.14 -7.34 -4.47 -7.16 ...
## $ log_turnover_ratio : num [1:4881] -5.32 -6.16 -5.2 -6.61 -4.54 ...
## $ log_volatility : num [1:4881] -1.736 -0.418 -1.274 -1.878 -1.011 ...
## $ log_market_share : num [1:4881] -3.56 -13.82 -3.68 -1.46 -2.88 ...
## $ log_market_cap : num [1:4881] 26.4 21.7 25.8 22.2 24.9 ...
## $ log_tracking_error : num [1:4881] -3.79 -2.95 -3.65 -3.21 -3.43 ...
## $ vol_cap_ratio : num [1:4881] 2.58e-05 1.69e-03 8.15e-05 1.48e-04 4.84e-04 ...
## $ log_vol_cap_ratio : num [1:4881] -10.56 -6.38 -9.41 -8.82 -7.63 ...
## $ scaled_vol : num [1:4881, 1] -0.254 -0.265 -0.231 -0.281 -0.153 ...
## ..- attr(*, "scaled:center")= num 7.1e+07
## ..- attr(*, "scaled:scale")= num 2.5e+08
## $ scaled_dollar_vol : num [1:4881, 1] -0.0912 -0.1478 -0.1122 -0.1478 -0.1195 ...
## ..- attr(*, "scaled:center")= num 3.75e+09
## ..- attr(*, "scaled:scale")= num 2.53e+10
## $ scaled_spread : num [1:4881, 1] -0.373 0.919 -0.412 0.534 -0.401 ...
## ..- attr(*, "scaled:center")= num 0.00537
## ..- attr(*, "scaled:scale")= num 0.0115
## $ scaled_turnover_ratio : num [1:4881, 1] -0.0436 -0.0523 -0.0417 -0.0546 -0.0256 ...
## ..- attr(*, "scaled:center")= num 0.0189
## ..- attr(*, "scaled:scale")= num 0.321
## $ scaled_volatility : num [1:4881, 1] -0.451 0.825 -0.177 -0.513 0.045 ...
## ..- attr(*, "scaled:center")= num 0.347
## ..- attr(*, "scaled:scale")= num 0.378
## $ scaled_sharpe_ratio : num [1:4881, 1] 0.125 0.281 0.203 0.098 0.236 ...
## ..- attr(*, "scaled:center")= num -228
## ..- attr(*, "scaled:scale")= num 675
## $ scaled_market_share : num [1:4881, 1] -0.147 -0.152 -0.148 -0.117 -0.143 ...
## ..- attr(*, "scaled:center")= num 1.03
## ..- attr(*, "scaled:scale")= num 6.79
## $ scaled_market_cap : num [1:4881, 1] -0.0668 -0.1749 -0.1159 -0.1743 -0.1509 ...
## ..- attr(*, "scaled:center")= num 4.73e+11
## ..- attr(*, "scaled:scale")= num 2.69e+12
## $ scaled_abs_corr_sp : num [1:4881, 1] 0.5055 -0.9971 0.1873 -1.4662 0.0368 ...
## ..- attr(*, "scaled:center")= num 0.585
## ..- attr(*, "scaled:scale")= num 0.267
## $ scaled_tracking_error : num [1:4881, 1] -0.3746 0.5239 -0.2674 0.1616 -0.0769 ...
## ..- attr(*, "scaled:center")= num 0.0349
## ..- attr(*, "scaled:scale")= num 0.0332
## $ Mkt_beta : num [1:4881] 0.00362 0.01878 0.01514 0.00029 0.00657 ...
## $ SMB_beta : num [1:4881] 0.001857 0.003485 0.005584 -0.000635 0.013586 ...
## $ HML_beta : num [1:4881] -0.00244 0.00175 -0.00202 -0.002 0.00093 ...
## $ company_sizeMedium : num [1:4881] 1 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeSmall : num [1:4881] 0 1 1 1 1 0 0 1 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeLarge : num [1:4881] 0 0 0 0 0 1 1 0 1 1 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ security_typeStock : num [1:4881] 1 1 1 1 1 1 1 0 1 1 ...
## ..- attr(*, "assign")= int [1:2] 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ security_type: chr "contr.treatment"
## $ security_typeETF : num [1:4881] 0 0 0 0 0 0 0 1 0 0 ...
## ..- attr(*, "assign")= int [1:2] 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ security_type: chr "contr.treatment"
## $ SECTORConsumer_Staples : num [1:4881] 1 0 0 1 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORUnknown : num [1:4881] 0 1 0 0 1 0 0 1 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORInformation_Technology: num [1:4881] 0 0 1 0 0 1 1 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORFinancials : num [1:4881] 0 0 0 0 0 0 0 0 1 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORIndustrials : num [1:4881] 0 0 0 0 0 0 0 0 0 1 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORHealth_Care : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORConsumer_Discretionary: num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORMaterials : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORReal_Estate : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORUtilities : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTOREnergy : num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORCommunication_Services: num [1:4881] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 57
## PERMNO returns SHRCD TICKER SECTOR company_size VOL dollar_vol
## <chr> <dbl> <int> <chr> <fct> <fct> <dbl> <dbl>
## 1 10026 0.0192 11 JJSF Consumer_Staples Medium 7.59e6 1.44e 9
## 2 10028 0.126 11 DGSE Unknown Small 4.65e6 6.27e 6
## 3 10032 0.0109 11 PLXS Information_Tech… Small 1.32e7 9.09e 8
## 4 10044 -0.00613 11 RMCF Consumer_Staples Small 6.59e5 5.93e 6
## 5 10051 -0.00417 11 HNGR Unknown Small 3.27e7 7.24e 8
## 6 10104 0.0211 11 ORCL Information_Tech… Large 9.55e8 5.21e10
## 7 10107 0.0744 11 MSFT Information_Tech… Large 1.84e9 2.61e11
## 8 10113 0.0534 73 AADR Unknown Small 6.25e5 3.09e 7
## 9 10138 0.0732 11 TROW Financials Large 7.70e7 8.87e 9
## 10 10145 0.0387 11 HON Industrials Large 2.26e8 3.85e10
## # ℹ 49 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## # abs_corr_sp <dbl>, security_type <fct>, tracking_error <dbl>,
## # log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## # log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## # log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## # log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …
Make sure returns are accurately calculated
## The date that most PERMNOs has highest price change: 2020-01-17
## The date that most PERMNOs has lowest price change: 2019-12-16
## Number of PERMNOs in the dataset: 5929
## Number of PERMNOs on the date that most PERMNOs has highest price change: 5746
## Number of PERMNOs on the date that most PERMNOs has lowest price change: 5747
## Number of PERMNO that should be include in the dataset but did not: 0
## That PERMNO is:
Correlation of variable
#Find the correlation between retruns and each potential explanatory variables
cor_p1_m1 <- cor(task2_reg_p1_df1 %>%
select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)),
use = "pairwise.complete.obs")
# Correlation over absolute 0.7 = high risk of multicollinearity
high_corr_p1_m1 <- which(abs(cor_p1_m1) > 0.7, arr.ind = TRUE)
high_corr_pairs_p1_m1 <- data.frame(
Feature1 = rownames(cor_p1_m1)[high_corr_p1_m1[,1]],
Feature2 = colnames(cor_p1_m1)[high_corr_p1_m1[,2]],
Correlation = cor_p1_m1[high_corr_p1_m1]
)
high_corr_pairs_p1_m1 <- high_corr_pairs_p1_m1 %>%
filter(Correlation < 1)
# Inspect the variables with multicollinearity
head(high_corr_pairs_p1_m1)## Feature1 Feature2 Correlation
## 1 log_volatility volatility 0.7562346
## 2 log_tracking_error tracking_error 0.8091045
## 3 log_dollar_vol log_vol 0.9174205
## 4 log_market_cap log_vol 0.8126475
## 5 log_vol log_dollar_vol 0.9174205
## 6 log_spread log_dollar_vol -0.7862752
## Var1 Var2 value
## 1 returns returns 1.000000000
## 2 VOL returns 0.035337402
## 3 dollar_vol returns 0.050754431
## 4 bid_ask_spread returns -0.056033284
## 5 turnover_ratio returns 0.004433236
## 6 volatility returns 0.060627680
Build Model 1
# Splitting the dataset into training and testing sets
set.seed(696)
# Non-COVID Period
X_p1_m1 <- task2_reg_p1_df1 %>%
select(Mkt_beta, SMB_beta, HML_beta, log_turnover_ratio, scaled_spread, log_volatility, log_market_share, vol_cap_ratio, company_sizeSmall, company_sizeLarge, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown)
y_p1_m1 <- task2_reg_p1_df1$returns
splitIndex_p1_m1 <- createDataPartition(y_p1_m1,
p = 0.8,
list = FALSE)
X_train_data_p1_m1 <- X_p1_m1[splitIndex_p1_m1, ]
X_test_data_p1_m1 <- X_p1_m1[-splitIndex_p1_m1, ]
y_train_data_p1_m1 <- y_p1_m1[splitIndex_p1_m1]
y_test_data_p1_m1 <- y_p1_m1[-splitIndex_p1_m1]
train_data_p1_m1 <- cbind(X_train_data_p1_m1, returns = y_train_data_p1_m1)
# Build Model
model_1_p1 <- lm(returns ~ ., data = train_data_p1_m1)
# Prediction for training sample
pred_train_p1_m1 <- predict(model_1_p1, newdata = X_train_data_p1_m1)
metrics_train_p1_m1 <- postResample(pred = pred_train_p1_m1,
obs = train_data_p1_m1$returns)
metrics_table_train_p1_m1 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_train_p1_m1["RMSE"]^2,
metrics_train_p1_m1["RMSE"],
metrics_train_p1_m1["MAE"],
metrics_train_p1_m1["Rsquared"])
)
# Prediction for testing sample
pred_test_p1_m1 <- predict(model_1_p1, newdata = X_test_data_p1_m1)
metrics_test_p1_m1 <- postResample(pred = pred_test_p1_m1,
obs = y_test_data_p1_m1)
metrics_table_test_p1_m1 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_test_p1_m1["RMSE"]^2
, metrics_test_p1_m1["RMSE"], metrics_test_p1_m1["MAE"], metrics_test_p1_m1["Rsquared"])
)## Performance Metrics for the Training Dataset:
## Metric Value
## 1 MSE 0.002234472
## 2 RMSE 0.047270202
## 3 MAE 0.034317481
## 4 R-squared 0.105304918
## Performance Metrics for the Testing Dataset:
## Metric Value
## 1 MSE 0.001913322
## 2 RMSE 0.043741542
## 3 MAE 0.031795969
## 4 R-squared 0.173938416
## Regression coefficient of Model 1:
##
## Call:
## lm(formula = returns ~ ., data = train_data_p1_m1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.190295 -0.022907 -0.000372 0.023547 0.186048
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0323218 0.0065479 4.936 8.30e-07 ***
## Mkt_beta 0.9574882 0.1190404 8.043 1.15e-15 ***
## SMB_beta -0.0984585 0.1025318 -0.960 0.33698
## HML_beta -1.1786067 0.1157057 -10.186 < 2e-16 ***
## log_turnover_ratio 0.0016895 0.0007948 2.126 0.03360 *
## scaled_spread -0.0005114 0.0010723 -0.477 0.63345
## log_volatility 0.0048233 0.0012160 3.967 7.42e-05 ***
## log_market_share 0.0006303 0.0003347 1.883 0.05974 .
## vol_cap_ratio 0.0604965 0.0504603 1.199 0.23064
## company_sizeSmall -0.0054483 0.0022215 -2.453 0.01423 *
## company_sizeLarge 0.0028200 0.0031286 0.901 0.36745
## SECTORMaterials -0.0083186 0.0074315 -1.119 0.26305
## SECTORIndustrials 0.0083327 0.0056437 1.476 0.13990
## SECTORFinancials -0.0141718 0.0053945 -2.627 0.00865 **
## SECTORInformation_Technology 0.0162510 0.0057003 2.851 0.00438 **
## SECTORConsumer_Discretionary 0.0024951 0.0057585 0.433 0.66482
## SECTORHealth_Care 0.0080007 0.0056505 1.416 0.15688
## SECTOREnergy 0.0124139 0.0080301 1.546 0.12221
## SECTORUtilities 0.0112863 0.0080688 1.399 0.16197
## SECTORReal_Estate -0.0013079 0.0114142 -0.115 0.90878
## SECTORCommunication_Services 0.0323416 0.0075503 4.284 1.88e-05 ***
## SECTORUnknown 0.0103622 0.0050517 2.051 0.04031 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0474 on 3883 degrees of freedom
## Multiple R-squared: 0.1053, Adjusted R-squared: 0.1005
## F-statistic: 21.76 on 21 and 3883 DF, p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
## Mkt_beta SMB_beta
## 1.319676 1.419217
## HML_beta log_turnover_ratio
## 1.291541 1.474125
## scaled_spread log_volatility
## 1.551150 2.234558
## log_market_share vol_cap_ratio
## 1.662150 1.072415
## company_sizeSmall company_sizeLarge
## 1.708948 1.532598
## SECTORMaterials SECTORIndustrials
## 1.689553 3.415319
## SECTORFinancials SECTORInformation_Technology
## 4.525347 3.307744
## SECTORConsumer_Discretionary SECTORHealth_Care
## 3.168057 3.558749
## SECTOREnergy SECTORUtilities
## 1.639638 1.542901
## SECTORReal_Estate SECTORCommunication_Services
## 1.210997 1.646023
## SECTORUnknown
## 10.973571
## Despite SectorUnknown has a VIF over 5, considering it is actually the group of data with missing value in Sector, while some other industry categories are important to the model. After trial and error, it is better to keep the industry variable in the model.
# Scatterplot of Prediction vs Actual
plot_data_p1_m1 <- data.frame(
Predicted = pred_test_p1_m1,
Actual = y_test_data_p1_m1
)
ggplot(data = plot_data_p1_m1,
aes(x = Predicted,
y = Actual)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom") +
labs(title = "Evaluating Model 1's Performance: Predicted vs. Actual Returns in the Non-COVID Period",
x = "Predicted returns",
y = "Actual returns")Important variable
# Extract coefficients
coef_m1 <- coef(model_1_p1)
importance_m1 <- data.frame(
Factors = names(coef_m1),
Coefficient = coef_m1
)
importance_m1 <- importance_m1[importance_m1$Factor != "(Intercept)", ]
# Plot with positive and negative impacts
plot_m1_p1_factors <- ggplot(importance_m1,
aes(x = reorder(Factors, Coefficient),
y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Coefficient, 3)),
vjust = ifelse(importance_m1$Coefficient > 0, 0.5, 0.3),
color = "black") +
coord_flip() +
labs(title = "Impact of factors on Returns for Model 1",
x = "Factors",
y = "Coefficient") +
theme_minimal() +
scale_fill_manual(
values = c("TRUE" = "steelblue", "FALSE" = "coral"),
labels = c("TRUE" = "Positive Impact",
"FALSE" = "Negative Impact"),
name = ""
) +
theme(
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
)
print(plot_m1_p1_factors)Group by Sector
# Apply the model particularly by industry
pred_sector_p1_m1 <- cbind(X_test_data_p1_m1, SECTOR = task2_reg_p1_df1$SECTOR[-splitIndex_p1_m1])
data_sector_p1_m1 <- data.frame(
Predicted = pred_test_p1_m1,
Actual = y_test_data_p1_m1,
SECTOR = pred_sector_p1_m1$SECTOR
)
data_sector_p1_m1$SECTOR <- as.character(data_sector_p1_m1$SECTOR)
# Calculate evaluation metrics to see the performance of model apply on SECTOR
metrics_table_sector_p1_m1 <- data_sector_p1_m1 %>%
group_by(SECTOR) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_sector_p1_m1)## # A tibble: 12 × 5
## SECTOR MSE RMSE MAE R_Squared
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Communication_Services 0.00436 0.0661 0.0553 0.0987
## 2 Consumer_Discretionary 0.00300 0.0548 0.0441 0.0622
## 3 Consumer_Staples 0.00273 0.0523 0.0410 -0.379
## 4 Energy 0.00422 0.0650 0.0533 -0.679
## 5 Financials 0.00146 0.0382 0.0318 0.0991
## 6 Health_Care 0.00360 0.0600 0.0476 0.0543
## 7 Industrials 0.00262 0.0512 0.0385 -0.00432
## 8 Information_Technology 0.00269 0.0519 0.0420 0.120
## 9 Materials 0.00302 0.0550 0.0447 0.0163
## 10 Real_Estate 0.00418 0.0647 0.0575 0.297
## 11 Unknown 0.00128 0.0358 0.0245 0.208
## 12 Utilities 0.00315 0.0561 0.0442 -0.225
# Plot the prediction vs actual by SECTOR
plot_m1_p1_sector <- ggplot(data = data_sector_p1_m1,
aes(x = Predicted,
y = Actual,
color = SECTOR)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 1's Performance: Predicted vs. Actual Returns",
subtitle = "During the Non-COVID Period (Grouped by Industry Sector)",
x = "Predicted returns",
y = "Actual returns",
color = "Industry Sector")
print(plot_m1_p1_sector)Group by Security Type
# Apply the model particularly by security type
pred_type_p1_m1 <- cbind(X_test_data_p1_m1, security_type = task2_reg_p1_df1$security_type[-splitIndex_p1_m1])
data_type_p1_m1 <- data.frame(
Predicted = pred_test_p1_m1,
Actual = y_test_data_p1_m1,
security_type = pred_type_p1_m1$security_type
)
# Calculate evaluation metrics to see the performance of model apply on security type
metrics_table_type_p1_m1 <- data_type_p1_m1 %>%
group_by(security_type) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_type_p1_m1)## # A tibble: 2 × 5
## security_type MSE RMSE MAE R_Squared
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Stock 0.00286 0.0534 0.0421 0.134
## 2 ETF 0.000712 0.0267 0.0187 0.299
# Plot the prediction vs actual by security type
plot_m1_p1_type <- ggplot(data = data_type_p1_m1,
aes(x = Predicted,
y = Actual,
color = security_type)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 1's Performance: Predicted vs. Actual Returns",
subtitle = "During the Non-COVID Period (Grouped by Security Type)",
x = "Predicted returns",
y = "Actual returns",
color = "Security Type")
print(plot_m1_p1_type)Group by Company size
# Apply the model particularly by company size
pred_size_p1_m1 <- cbind(X_test_data_p1_m1, company_size = task2_reg_p1_df1$company_size[-splitIndex_p1_m1])
data_size_p1_m1 <- data.frame(
Predicted = pred_test_p1_m1,
Actual = y_test_data_p1_m1,
company_size = pred_size_p1_m1$company_size
)
# Calculate evaluation metrics to see the performance of model apply on company size
metrics_table_size_p1_m1 <- data_size_p1_m1 %>%
group_by(company_size) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_size_p1_m1)## # A tibble: 3 × 5
## company_size MSE RMSE MAE R_Squared
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Medium 0.00171 0.0414 0.0317 0.167
## 2 Small 0.00197 0.0443 0.0317 0.138
## 3 Large 0.00185 0.0431 0.0328 0.129
# Plot the prediction vs actual by company size
plot_m1_p1_size <- ggplot(data = data_size_p1_m1,
aes(x = Predicted,
y = Actual,
color = company_size)) +
geom_point(size = 2, alpha = 0.7) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 1's Performance: Predicted vs. Actual Returns",
subtitle = "During the Non-COVID Period (Grouped by Company Size)",
x = "Predicted returns",
y = "Actual returns",
color = "Company Size")
print(plot_m1_p1_size)Model 2: Returns: 2020-02-14 to 2020-03-20 & X variables: before 2019-02-14
# Period 1 - Dataset 1
task2_p2_df1_1 <- transform_dataset_for_task2(task2, "2020-02-14", "2020-03-20", "2019-08-20", "2020-02-14", "both", FFF_beta_covid)## Highest Date: [1] "2020-02-14"
##
## Lowest Date: [1] "2020-03-18"
##
## Number of PERMNO in the dataset that calculated returns: 5764
## Number of PERMNO that are in the dataset that aggregate x variables: 5924
## Number of PERMNO that are in the both datasets: 5763
## Number of PERMNO that are in the final dataset: 5708
# Remove outlier for returns
task2_p2_df1_1 <- remove_outliers_all(task2_p2_df1_1, "returns")
# Inspect the distribution of Returns
print(
ggplot(task2_p2_df1_1, aes(x = returns)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
theme_minimal() +
labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
)## Sample size after remove outlier: 5235
# Create dummy variable for categorical variable
task2_reg_p2_df1 <- task2_p2_df1_1 %>%
filter(!is.na(volatility)) %>%
mutate(company_size = factor(company_size, levels = unique(task2_p2_df1_1$company_size)),
security_type = factor(security_type, levels = unique(task2_p2_df1_1$security_type)),
SECTOR = factor(SECTOR, levels = unique(task2_p2_df1_1$SECTOR))) %>%
bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
bind_cols(as_tibble(model.matrix(~ security_type - 1, data = .))) %>%
bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) ## tibble [5,190 × 57] (S3: tbl_df/tbl/data.frame)
## $ PERMNO : chr [1:5190] "10026" "10028" "10032" "10044" ...
## $ returns : num [1:5190] 0.5036 0.0106 0.9654 0.5507 1.0622 ...
## $ SHRCD : int [1:5190] 11 11 11 11 11 11 11 73 11 11 ...
## $ TICKER : chr [1:5190] "JJSF" "DGSE" "PLXS" "RMCF" ...
## $ SECTOR : Factor w/ 12 levels "Consumer_Staples",..: 1 2 3 1 2 3 3 2 4 5 ...
## $ company_size : Factor w/ 3 levels "Medium","Small",..: 1 2 2 2 2 3 3 2 3 3 ...
## $ VOL : num [1:5190] 11872557 9713299 20878422 1408090 42541489 ...
## $ dollar_vol : num [1:5190] 2.20e+09 1.62e+07 1.50e+09 1.25e+07 9.85e+08 ...
## $ bid_ask_spread : num [1:5190] 0.001024 0.014651 0.000675 0.011492 0.000678 ...
## $ turnover_ratio : num [1:5190] 0.00507 0.00291 0.00577 0.00189 0.0092 ...
## $ volatility : num [1:5190] 0.195 0.618 0.238 0.15 0.296 ...
## $ sharpe_ratio : num [1:5190] -137.3 -41.7 -109.6 -162.8 -88.8 ...
## $ market_share : num [1:5190] 0.0422 0.024 0.0359 0.3338 0.0776 ...
## $ market_cap : num [1:5190] 4.36e+11 4.72e+09 2.55e+11 6.66e+09 1.09e+11 ...
## $ abs_corr_sp : num [1:5190] 0.72 0.318 0.635 0.193 0.595 ...
## $ security_type : Factor w/ 2 levels "Stock","ETF": 1 1 1 1 1 1 1 2 1 1 ...
## $ tracking_error : num [1:5190] 0.0225 0.0523 0.0261 0.0403 0.0324 ...
## $ log_vol : num [1:5190] 16.3 16.1 16.9 14.2 17.6 ...
## $ log_dollar_vol : num [1:5190] 21.5 16.6 21.1 16.3 20.7 ...
## $ log_spread : num [1:5190] -6.88 -4.22 -7.3 -4.47 -7.29 ...
## $ log_turnover_ratio : num [1:5190] -5.28 -5.84 -5.15 -6.27 -4.69 ...
## $ log_volatility : num [1:5190] -1.637 -0.482 -1.434 -1.896 -1.218 ...
## $ log_market_share : num [1:5190] -3.16 -3.73 -3.33 -1.1 -2.56 ...
## $ log_market_cap : num [1:5190] 26.8 22.3 26.3 22.6 25.4 ...
## $ log_tracking_error : num [1:5190] -3.79 -2.95 -3.65 -3.21 -3.43 ...
## $ vol_cap_ratio : num [1:5190] 2.72e-05 2.06e-03 8.18e-05 2.11e-04 3.91e-04 ...
## $ log_vol_cap_ratio : num [1:5190] -10.51 -6.19 -9.41 -8.46 -7.85 ...
## $ scaled_vol : num [1:5190, 1] -0.255 -0.26 -0.231 -0.282 -0.174 ...
## ..- attr(*, "scaled:center")= num 1.09e+08
## ..- attr(*, "scaled:scale")= num 3.8e+08
## $ scaled_dollar_vol : num [1:5190, 1] -0.0898 -0.1428 -0.1068 -0.1429 -0.1193 ...
## ..- attr(*, "scaled:center")= num 5.9e+09
## ..- attr(*, "scaled:scale")= num 4.12e+10
## $ scaled_spread : num [1:5190, 1] -0.38 0.871 -0.412 0.581 -0.411 ...
## ..- attr(*, "scaled:center")= num 0.00516
## ..- attr(*, "scaled:scale")= num 0.0109
## $ scaled_turnover_ratio : num [1:5190, 1] -0.0606 -0.0707 -0.0573 -0.0755 -0.0412 ...
## ..- attr(*, "scaled:center")= num 0.018
## ..- attr(*, "scaled:scale")= num 0.213
## $ scaled_volatility : num [1:5190, 1] -0.399 0.764 -0.279 -0.521 -0.121 ...
## ..- attr(*, "scaled:center")= num 0.34
## ..- attr(*, "scaled:scale")= num 0.364
## $ scaled_sharpe_ratio : num [1:5190, 1] 0.15 0.291 0.191 0.113 0.222 ...
## ..- attr(*, "scaled:center")= num -240
## ..- attr(*, "scaled:scale")= num 681
## $ scaled_market_share : num [1:5190, 1] -0.147 -0.149 -0.148 -0.117 -0.143 ...
## ..- attr(*, "scaled:center")= num 1.45
## ..- attr(*, "scaled:scale")= num 9.55
## $ scaled_market_cap : num [1:5190, 1] -0.0686 -0.1709 -0.1115 -0.1705 -0.1463 ...
## ..- attr(*, "scaled:center")= num 7.25e+11
## ..- attr(*, "scaled:scale")= num 4.22e+12
## $ scaled_abs_corr_sp : num [1:5190, 1] 0.5053 -0.9915 0.1883 -1.4588 0.0384 ...
## ..- attr(*, "scaled:center")= num 0.584
## ..- attr(*, "scaled:scale")= num 0.268
## $ scaled_tracking_error : num [1:5190, 1] -0.3752 0.5248 -0.2678 0.1619 -0.0769 ...
## ..- attr(*, "scaled:center")= num 0.0349
## ..- attr(*, "scaled:scale")= num 0.0331
## $ Mkt_beta : num [1:5190] 0.0017 0.014261 0.014132 0.000384 0.003971 ...
## $ SMB_beta : num [1:5190] 0.00197 -0.00226 0.00549 0.00192 0.01299 ...
## $ HML_beta : num [1:5190] -0.000521 0.000203 0.000435 -0.002509 0.002974 ...
## $ company_sizeMedium : num [1:5190] 1 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeSmall : num [1:5190] 0 1 1 1 1 0 0 1 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeLarge : num [1:5190] 0 0 0 0 0 1 1 0 1 1 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ security_typeStock : num [1:5190] 1 1 1 1 1 1 1 0 1 1 ...
## ..- attr(*, "assign")= int [1:2] 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ security_type: chr "contr.treatment"
## $ security_typeETF : num [1:5190] 0 0 0 0 0 0 0 1 0 0 ...
## ..- attr(*, "assign")= int [1:2] 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ security_type: chr "contr.treatment"
## $ SECTORConsumer_Staples : num [1:5190] 1 0 0 1 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORUnknown : num [1:5190] 0 1 0 0 1 0 0 1 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORInformation_Technology: num [1:5190] 0 0 1 0 0 1 1 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORFinancials : num [1:5190] 0 0 0 0 0 0 0 0 1 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORIndustrials : num [1:5190] 0 0 0 0 0 0 0 0 0 1 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORHealth_Care : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORConsumer_Discretionary: num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORMaterials : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORReal_Estate : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORUtilities : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTOREnergy : num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORCommunication_Services: num [1:5190] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 57
## PERMNO returns SHRCD TICKER SECTOR company_size VOL dollar_vol
## <chr> <dbl> <int> <chr> <fct> <fct> <dbl> <dbl>
## 1 10026 0.504 11 JJSF Consumer_Staples Medium 1.19e7 2.20e 9
## 2 10028 0.0106 11 DGSE Unknown Small 9.71e6 1.62e 7
## 3 10032 0.965 11 PLXS Information_Techn… Small 2.09e7 1.50e 9
## 4 10044 0.551 11 RMCF Consumer_Staples Small 1.41e6 1.25e 7
## 5 10051 1.06 11 HNGR Unknown Small 4.25e7 9.85e 8
## 6 10104 0.173 11 ORCL Information_Techn… Large 1.37e9 7.47e10
## 7 10107 0.320 11 MSFT Information_Techn… Large 3.00e9 4.57e11
## 8 10113 0.641 73 AADR Unknown Small 1.38e6 7.21e 7
## 9 10138 0.267 11 TROW Financials Large 1.21e8 1.46e10
## 10 10145 0.512 11 HON Industrials Large 3.35e8 5.77e10
## # ℹ 49 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## # abs_corr_sp <dbl>, security_type <fct>, tracking_error <dbl>,
## # log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## # log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## # log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## # log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …
To make sure returns are accurately calculated
# Find the date with the most PERMNOs having the highest price change
highest_change_covid <- task2 %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
group_by(PERMNO) %>%
filter(PRC == max(PRC)) %>%
ungroup() %>%
group_by(date) %>%
summarise(num_highest = n()) %>%
arrange(desc(num_highest))
# Find the date with the most PERMNOs having the lowest price change
lowest_change_covid <- task2 %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
group_by(PERMNO) %>%
filter(PRC == min(PRC)) %>%
ungroup() %>%
group_by(date) %>%
summarise(num_lowest = n()) %>%
arrange(desc(num_lowest))
# Inspect the results
returns_covid <- task2 %>%
filter(date == highest_change_covid$date[1] | date == lowest_change_covid$date[1]) %>%
group_by(PERMNO) %>%
summarise(max_change = PRC[date == highest_change_covid$date[1]],
min_change = PRC[date == lowest_change_covid$date[1]],
returns_covid = (max_change - min_change) / min_change) %>%
select(PERMNO, returns_covid)
comparison_covid <- returns_covid %>%
left_join(task2_reg_p2_df1, by = "PERMNO") %>%
mutate(difference_covid = returns_covid != returns) %>%
filter(difference_covid) ## The date that most PERMNOs has highest price change: 2020-02-14
## The date that most PERMNOs has lowest price change: 2020-03-18
## Number of PERMNOs in the dataset: 5929
## Number of PERMNOs on the date that most PERMNOs has highest price change: 5762
## Number of PERMNOs on the date that most PERMNOs has lowest price change: 5711
## Number of PERMNO that should be include in the dataset but did not: 0
## That PERMNO is:
Correlation of variable
#Find the correlation between retruns and each potential explanatory variables
cor_p2_m1 <- cor(task2_reg_p2_df1%>% select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)), use = "pairwise.complete.obs")
# Correlation over absolute 0.7 = high risk of multicollinearity
high_corr_p2_m1 <- which(abs(cor_p2_m1) > 0.7, arr.ind = TRUE)
high_corr_pairs_p2_m1 <- data.frame(
Feature1 = rownames(cor_p2_m1)[high_corr_p2_m1[,1]],
Feature2 = colnames(cor_p2_m1)[high_corr_p2_m1[,2]],
Correlation = cor_p2_m1[high_corr_p2_m1]
)
high_corr_pairs_p2_m1 <- high_corr_pairs_p2_m1 %>% filter(Correlation < 1)
# Inspect the variables with multicollinearity
head(high_corr_pairs_p2_m1)## Feature1 Feature2 Correlation
## 1 log_spread bid_ask_spread 0.7156169
## 2 tracking_error volatility 0.7661033
## 3 log_volatility volatility 0.7777328
## 4 log_tracking_error volatility 0.7001564
## 5 scaled_tracking_error volatility 0.7661033
## 6 log_tracking_error abs_corr_sp -0.7654664
## Var1 Var2 value
## 1 returns returns 1.00000000
## 2 VOL returns 0.03233583
## 3 dollar_vol returns -0.02082236
## 4 bid_ask_spread returns 0.01411746
## 5 turnover_ratio returns 0.01368088
## 6 volatility returns 0.24193335
Build Model 2
# Splitting the dataset into training and testing sets
set.seed(781)
# COVID Period
X_p2_m2 <- task2_reg_p2_df1 %>%
select(Mkt_beta, SMB_beta, HML_beta, log_volatility, log_spread, scaled_market_share, scaled_abs_corr_sp, scaled_dollar_vol, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown)
y_p2_m2 <- task2_reg_p2_df1$returns
splitIndex_p2_m2 <- createDataPartition(y_p2_m2,
p = 0.8,
list = FALSE)
X_train_data_p2_m2 <- X_p2_m2[splitIndex_p2_m2, ]
X_test_data_p2_m2 <- X_p2_m2[-splitIndex_p2_m2, ]
y_train_data_p2_m2 <- y_p2_m2[splitIndex_p2_m2]
y_test_data_p2_m2 <- y_p2_m2[-splitIndex_p2_m2]
train_data_p2_m2 <- cbind(X_train_data_p2_m2, returns = y_train_data_p2_m2)
# Build Model
model_2_p2 <- lm(returns ~ ., data = train_data_p2_m2)
# Prediction for training sample - Covid
pred_train_p2_m2 <- predict(model_2_p2, newdata = X_train_data_p2_m2)
metrics_train_p2_m2 <- postResample(pred = pred_train_p2_m2,
obs = train_data_p2_m2$returns)
metrics_table_train_p2_m2 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_train_p2_m2["RMSE"]^2,
metrics_train_p2_m2["RMSE"],
metrics_train_p2_m2["MAE"],
metrics_train_p2_m2["Rsquared"])
)
# Prediction for testing sample - covid
pred_test_p2_m2 <- predict(model_2_p2, newdata = X_test_data_p2_m2)
metrics_test_p2_m2 <- postResample(pred = pred_test_p2_m2,
obs = y_test_data_p2_m2)
metrics_table_test_p2_m2 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_test_p2_m2["RMSE"]^2
, metrics_test_p2_m2["RMSE"], metrics_test_p2_m2["MAE"], metrics_test_p2_m2["Rsquared"])
)## Performance Metrics for the Training Dataset:
## Metric Value
## 1 MSE 0.1240295
## 2 RMSE 0.3521782
## 3 MAE 0.2591119
## 4 R-squared 0.2638832
## Performance Metrics for the Testing Dataset:
## Metric Value
## 1 MSE 0.1077629
## 2 RMSE 0.3282726
## 3 MAE 0.2462835
## 4 R-squared 0.2922223
## Regression coefficient of Model 1:
##
## Call:
## lm(formula = returns ~ ., data = train_data_p2_m2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.40345 -0.19946 -0.03503 0.16097 1.28868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.591175 0.047373 12.479 < 2e-16 ***
## Mkt_beta 11.393483 0.972606 11.714 < 2e-16 ***
## SMB_beta 4.398360 0.644666 6.823 1.02e-11 ***
## HML_beta 3.126379 0.852140 3.669 0.000247 ***
## log_volatility 0.139262 0.007828 17.790 < 2e-16 ***
## log_spread 0.005219 0.005039 1.036 0.300402
## scaled_market_share -0.006909 0.005626 -1.228 0.219476
## scaled_abs_corr_sp 0.061086 0.006986 8.744 < 2e-16 ***
## scaled_dollar_vol -0.007637 0.005266 -1.450 0.147055
## SECTORMaterials 0.141054 0.053298 2.647 0.008163 **
## SECTORIndustrials 0.139443 0.040457 3.447 0.000573 ***
## SECTORFinancials 0.193201 0.039455 4.897 1.01e-06 ***
## SECTORInformation_Technology 0.121068 0.040387 2.998 0.002737 **
## SECTORConsumer_Discretionary 0.302509 0.042456 7.125 1.22e-12 ***
## SECTORHealth_Care 0.034379 0.038975 0.882 0.377779
## SECTOREnergy 0.209086 0.061752 3.386 0.000716 ***
## SECTORUtilities 0.030553 0.061258 0.499 0.617980
## SECTORReal_Estate 0.312307 0.084514 3.695 0.000222 ***
## SECTORCommunication_Services 0.229145 0.053253 4.303 1.72e-05 ***
## SECTORUnknown 0.118594 0.036009 3.293 0.000998 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.353 on 4134 degrees of freedom
## Multiple R-squared: 0.2639, Adjusted R-squared: 0.2605
## F-statistic: 78 on 19 and 4134 DF, p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
## Mkt_beta SMB_beta
## 1.389091 1.233495
## HML_beta log_volatility
## 1.172159 2.112182
## log_spread scaled_market_share
## 1.751305 1.127001
## scaled_abs_corr_sp scaled_dollar_vol
## 1.618946 1.147129
## SECTORMaterials SECTORIndustrials
## 1.678586 3.361116
## SECTORFinancials SECTORInformation_Technology
## 4.065494 3.507963
## SECTORConsumer_Discretionary SECTORHealth_Care
## 2.740303 4.523545
## SECTOREnergy SECTORUtilities
## 1.511425 1.457989
## SECTORReal_Estate SECTORCommunication_Services
## 1.197435 1.697703
## SECTORUnknown
## 10.732340
## Despite SectorUnknown has a VIF over 5, considering it is actually the group of data with missing value in Sector, while some other industry categories are important to the model. After trial and error, it is better to keep the industry variable in the model.
# Scatterplot of Prediction vs Actual
plot_data_p2_m2 <- data.frame(
Predicted = pred_test_p2_m2,
Actual = y_test_data_p2_m2
)
plot_m2_p2 <- ggplot(data = plot_data_p2_m2,
aes(x = Predicted,
y = Actual)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom") +
labs(title = "Evaluating Model 2's Performance: Predicted vs. Actual Returns in the Non-covid Period",
x = "Predicted returns",
y = "Actual returns")
print(plot_m2_p2)Important variable
# Extract coefficients
coef_m2 <- coef(model_2_p2)
importance_m2 <- data.frame(
Factors = names(coef_m2),
Coefficient = coef_m2
)
importance_m2 <- importance_m2[importance_m2$Factor != "(Intercept)", ]
# Plot with positive and negative impacts
plot_m2_p2_factors <- ggplot(importance_m2, aes(x = reorder(Factors, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Coefficient, 3)),
vjust = ifelse(importance_m2$Coefficient > 0, 0.5, 0.3),
color = "black") +
coord_flip() +
labs(title = "Impact of factors on returns for Model 2",
x = "Factors",
y = "Coefficient") +
theme_minimal() +
scale_fill_manual(
values = c("TRUE" = "steelblue", "FALSE" = "coral"),
labels = c("TRUE" = "Positive Impact",
"FALSE" = "Negative Impact"),
name = ""
) +
theme(
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
) Group by Sector
pred_sector_p2_m2 <- cbind(X_test_data_p2_m2, SECTOR = task2_reg_p2_df1$SECTOR[-splitIndex_p2_m2])
data_sector_p2_m2 <- data.frame(
Predicted = pred_test_p2_m2,
Actual = y_test_data_p2_m2,
SECTOR = pred_sector_p2_m2$SECTOR
)
data_sector_p2_m2$SECTOR <- as.character(data_sector_p2_m2$SECTOR)
# Calculating performance metrics by sector
metrics_table_sector_p2_m2 <- data_sector_p2_m2 %>%
group_by(SECTOR) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_sector_p2_m2)## # A tibble: 12 × 5
## SECTOR MSE RMSE MAE R_Squared
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Communication_Services 0.129 0.359 0.291 -0.149
## 2 Consumer_Discretionary 0.137 0.370 0.310 0.126
## 3 Consumer_Staples 0.156 0.395 0.321 0.192
## 4 Energy 0.227 0.476 0.433 0.0855
## 5 Financials 0.0965 0.311 0.248 0.0482
## 6 Health_Care 0.184 0.428 0.339 0.0978
## 7 Industrials 0.158 0.398 0.331 0.0263
## 8 Information_Technology 0.127 0.357 0.287 -0.000730
## 9 Materials 0.125 0.353 0.276 -0.399
## 10 Real_Estate 0.0692 0.263 0.209 0.240
## 11 Unknown 0.0824 0.287 0.201 0.366
## 12 Utilities 0.0597 0.244 0.225 0.000703
plot_m2_p2_sector <- ggplot(data = data_sector_p2_m2,
aes(x = Predicted,
y = Actual,
color = SECTOR)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 2's Performance: Predicted vs. Actual Returns",
subtitle = "During the COVID Period (Grouped by Industry Sector)",
x = "Predicted returns",
y = "Actual returns",
color = "Industry Sector")
print(plot_m2_p2_sector)Group by Security Type
pred_type_p2_m2 <- cbind(X_test_data_p2_m2, security_type = task2_reg_p2_df1$security_type[-splitIndex_p2_m2])
data_type_p2_m2 <- data.frame(
Predicted = pred_test_p2_m2,
Actual = y_test_data_p2_m2,
security_type = pred_type_p2_m2$security_type
)
# Calculating performance metrics by sector
metrics_table_type_p2_m2 <- data_type_p2_m2 %>%
group_by(security_type) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_type_p2_m2)## # A tibble: 2 × 5
## security_type MSE RMSE MAE R_Squared
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Stock 0.141 0.375 0.301 0.159
## 2 ETF 0.0555 0.236 0.160 0.378
plot_m2_p2_type <- ggplot(data = data_type_p2_m2,
aes(x = Predicted,
y = Actual,
color = security_type)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 2's Performance: Predicted vs. Actual Returns",
subtitle = "During the COVID Period (Grouped by Security Type)",
x = "Predicted returns",
y = "Actual returns",
color = "Security Type")
print(plot_m2_p2_type)Group by Company size
pred_size_p2_m2 <- cbind(X_test_data_p2_m2, company_size = task2_reg_p2_df1$company_size[-splitIndex_p2_m2])
data_size_p2_m2 <- data.frame(
Predicted = pred_test_p2_m2,
Actual = y_test_data_p2_m2,
company_size = pred_size_p2_m2$company_size
)
# Calculating performance metrics by sector
metrics_table_size_p2_m2 <- data_size_p2_m2 %>%
group_by(company_size) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_size_p2_m2)## # A tibble: 3 × 5
## company_size MSE RMSE MAE R_Squared
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Medium 0.108 0.329 0.260 0.265
## 2 Small 0.111 0.334 0.247 0.285
## 3 Large 0.0726 0.269 0.214 0.371
plot_m2_p2_size <- ggplot(data = data_size_p2_m2,
aes(x = Predicted,
y = Actual,
color = company_size)) +
geom_point(size = 2, alpha = 0.7) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 2's Performance: Predicted vs. Actual Returns",
subtitle = "During the COVID Period (Grouped by Company Size)",
x = "Predicted returns",
y = "Actual returns",
color = "Company Size")
print(plot_m2_p2_size)Model 3: (ETFs) Returns: 2019-12-14 to 2020-01-20 & X variables: before 2019-02-14
# Period 1 - Dataset 2
task2_p1_df2_1 <- transform_dataset_for_task2(task2, "2019-12-14", "2020-01-20", "2019-08-20", "2019-12-14", "ETFs", FFF_beta_noncovid) ## Highest Date: [1] "2020-01-17"
##
## Lowest Date: [1] "2019-12-16"
##
## Number of PERMNO in the dataset that calculated returns: 5790
## Number of PERMNO that are in the dataset that aggregate x variables: 2180
## Number of PERMNO that are in the both datasets: 2148
## Number of PERMNO that are in the final dataset: 2129
# Remove outlier for returns
task2_p1_df2_1 <- remove_outliers_all(task2_p1_df2_1, "returns")
# Inspect the distribution of Returns
print(
ggplot(task2_p1_df2_1, aes(x = returns)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
theme_minimal() +
labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
)## Sample size after remove outlier: 1990
# Create dummy variable for categorical variable
task2_reg_p1_df2 <- task2_p1_df2_1 %>%
filter(!is.na(volatility)) %>%
mutate(company_size = factor(company_size, levels = unique(task2_p1_df2_1$company_size)),
SECTOR = factor(SECTOR, levels = unique(task2_p1_df2_1$SECTOR))) %>%
bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) ## tibble [1,964 × 49] (S3: tbl_df/tbl/data.frame)
## $ PERMNO : chr [1:1964] "10113" "11182" "11264" "11996" ...
## $ returns : num [1:1964] 0.05341 0.01247 0.09027 0.00653 0.06506 ...
## $ SHRCD : int [1:1964] 73 73 73 73 73 73 73 73 73 73 ...
## $ TICKER : chr [1:1964] "AADR" "GASL" "RETL" "EMLC" ...
## $ SECTOR : Factor w/ 6 levels "Unknown","Financials",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ company_size : Factor w/ 3 levels "Small","Medium",..: 1 1 1 2 2 2 2 2 1 1 ...
## $ VOL : num [1:1964] 6.25e+05 1.58e+08 3.03e+06 1.79e+08 1.75e+09 ...
## $ dollar_vol : num [1:1964] 3.09e+07 1.21e+09 7.16e+07 5.95e+09 1.49e+10 ...
## $ bid_ask_spread : num [1:1964] 0.003669 0.001949 0.00127 0.000446 0.001215 ...
## $ turnover_ratio : num [1:1964] 0.0036 0.4615 0.0612 0.0147 0.0234 ...
## $ volatility : num [1:1964] 0.1116 1.2453 0.5851 0.0666 0.1401 ...
## $ sharpe_ratio : num [1:1964] -228.4 -20.2 -42.9 -381.5 -187 ...
## $ market_share : num [1:1964] 0.00027 0.19917 0.05839 0.11148 1.2461 ...
## $ market_cap : num [1:1964] 8.59e+09 2.70e+09 1.17e+09 4.05e+11 6.44e+11 ...
## $ abs_corr_sp : num [1:1964] 0.834 0.578 0.841 0.638 0.605 ...
## $ security_type : chr [1:1964] "ETF" "ETF" "ETF" "ETF" ...
## $ tracking_error : num [1:1964] 0.012 0.1097 0.0557 0.0165 0.0305 ...
## $ log_vol : num [1:1964] 13.3 18.9 14.9 19 21.3 ...
## $ log_dollar_vol : num [1:1964] 17.2 20.9 18.1 22.5 23.4 ...
## $ log_spread : num [1:1964] -5.61 -6.24 -6.67 -7.71 -6.71 ...
## $ log_turnover_ratio : num [1:1964] -5.626 -0.773 -2.794 -4.221 -3.756 ...
## $ log_volatility : num [1:1964] -2.192 0.219 -0.536 -2.709 -1.965 ...
## $ log_market_share : num [1:1964] -8.21 -1.61 -2.84 -2.19 0.22 ...
## $ log_market_cap : num [1:1964] 22.9 21.7 20.9 26.7 27.2 ...
## $ log_tracking_error : num [1:1964] -4.42 -2.21 -2.89 -4.1 -3.49 ...
## $ vol_cap_ratio : num [1:1964] 7.28e-05 5.85e-02 2.58e-03 4.41e-04 2.71e-03 ...
## $ log_vol_cap_ratio : num [1:1964] -9.53 -2.84 -5.96 -7.73 -5.91 ...
## $ scaled_vol : num [1:1964, 1] -0.171 0.435 -0.162 0.516 6.569 ...
## ..- attr(*, "scaled:center")= num 44874771
## ..- attr(*, "scaled:scale")= num 2.59e+08
## $ scaled_dollar_vol : num [1:1964, 1] -0.0852 -0.0517 -0.084 0.0828 0.3359 ...
## ..- attr(*, "scaled:center")= num 3.03e+09
## ..- attr(*, "scaled:scale")= num 3.52e+10
## $ scaled_spread : num [1:1964, 1] 0.5702 -0.0831 -0.3413 -0.654 -0.3622 ...
## ..- attr(*, "scaled:center")= num 0.00217
## ..- attr(*, "scaled:scale")= num 0.00263
## $ scaled_turnover_ratio : num [1:1964, 1] -0.2741 7.5801 0.7132 -0.0839 0.065 ...
## ..- attr(*, "scaled:center")= num 0.0196
## ..- attr(*, "scaled:scale")= num 0.0583
## $ scaled_volatility : num [1:1964, 1] -0.1487 9.715 3.9712 -0.5408 0.0992 ...
## ..- attr(*, "scaled:center")= num 0.129
## ..- attr(*, "scaled:scale")= num 0.115
## $ scaled_sharpe_ratio : num [1:1964, 1] 0.2163 0.4132 0.3918 0.0716 0.2555 ...
## ..- attr(*, "scaled:center")= num -457
## ..- attr(*, "scaled:scale")= num 1057
## $ scaled_market_share : num [1:1964, 1] -0.1813 -0.0229 -0.135 -0.0927 0.8103 ...
## ..- attr(*, "scaled:center")= num 0.228
## ..- attr(*, "scaled:scale")= num 1.26
## $ scaled_market_cap : num [1:1964, 1] -0.179 -0.186 -0.188 0.309 0.603 ...
## ..- attr(*, "scaled:center")= num 1.54e+11
## ..- attr(*, "scaled:scale")= num 8.12e+11
## $ scaled_abs_corr_sp : num [1:1964, 1] 0.335 -0.654 0.363 -0.42 -0.55 ...
## ..- attr(*, "scaled:center")= num 0.747
## ..- attr(*, "scaled:scale")= num 0.259
## $ scaled_tracking_error : num [1:1964, 1] -0.29246 5.86143 2.45606 -0.00806 0.86839 ...
## ..- attr(*, "scaled:center")= num 0.0167
## ..- attr(*, "scaled:scale")= num 0.0159
## $ Mkt_beta : num [1:1964] 0.0065 0.03727 0.03821 0.00104 0.00481 ...
## $ SMB_beta : num [1:1964] -0.00132 0.04669 0.012 -0.00107 0.00296 ...
## $ HML_beta : num [1:1964] -0.00489 0.059424 0.025013 -0.000253 0.004463 ...
## $ company_sizeSmall : num [1:1964] 1 1 1 0 0 0 0 0 1 1 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeMedium : num [1:1964] 0 0 0 1 1 1 1 1 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeLarge : num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ SECTORUnknown : num [1:1964] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORFinancials : num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORInformation_Technology: num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORCommunication_Services: num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORIndustrials : num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORHealth_Care : num [1:1964] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:6] 1 1 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 49
## PERMNO returns SHRCD TICKER SECTOR company_size VOL dollar_vol
## <chr> <dbl> <int> <chr> <fct> <fct> <dbl> <dbl>
## 1 10113 0.0534 73 AADR Unknown Small 625314 30943487.
## 2 11182 0.0125 73 GASL Unknown Small 157637284 1211479740.
## 3 11264 0.0903 73 RETL Unknown Small 3030234 71590797.
## 4 11996 0.00653 73 EMLC Unknown Medium 178576800 5947066455.
## 5 12035 0.0651 73 AMLP Unknown Medium 1745883591 14865575916.
## 6 12059 0.00473 73 SCHR Unknown Medium 34034683 1886731858.
## 7 12064 0.00159 73 SCHO Unknown Medium 79027981 3996375262.
## 8 12065 0.00264 73 SCHP Unknown Medium 59689638 3393526567.
## 9 12075 0.0101 73 ELD Unknown Small 2743109 95351320.
## 10 12098 0.0453 73 ECON Unknown Small 2447440 54618394.
## # ℹ 41 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## # abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## # log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## # log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## # log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## # log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …
Correlation of variable
#Find the correlation between retruns and each potential explanatory variables
cor_p1_m3 <- cor(task2_reg_p1_df2%>% select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)), use = "pairwise.complete.obs")
# Correlation over absolute 0.7 = high risk of multicollinearity
high_corr_p1_m3 <- which(abs(cor_p1_m3) > 0.7, arr.ind = TRUE)
high_corr_pairs_p1_m3 <- data.frame(
Feature1 = rownames(cor_p1_m3)[high_corr_p1_m3[,1]],
Feature2 = colnames(cor_p1_m3)[high_corr_p1_m3[,2]],
Correlation = cor_p1_m3[high_corr_p1_m3]
)
high_corr_pairs_p1_m3 <- high_corr_pairs_p1_m3 %>% filter(Correlation < 1)
# Inspect the variables with multicollinearity
head(high_corr_pairs_p1_m3)## Feature1 Feature2 Correlation
## 1 market_cap dollar_vol 0.7143606
## 2 scaled_market_cap dollar_vol 0.7143606
## 3 log_spread bid_ask_spread 0.7674109
## 4 log_volatility volatility 0.7653878
## 5 dollar_vol market_cap 0.7143606
## 6 scaled_dollar_vol market_cap 0.7143606
## Var1 Var2 value
## 1 returns returns 1.00000000
## 2 VOL returns 0.04421879
## 3 dollar_vol returns 0.03832168
## 4 bid_ask_spread returns 0.02658229
## 5 turnover_ratio returns -0.02357584
## 6 volatility returns 0.16668020
Build Model 3
# Splitting the dataset into training and testing sets
set.seed(847)
# Non-COVID Period
X_p1_m3 <- task2_reg_p1_df2 %>%
select(Mkt_beta, log_dollar_vol, log_tracking_error, log_market_share, log_turnover_ratio, log_volatility)
y_p1_m3 <- task2_reg_p1_df2$returns
splitIndex_p1_m3 <- createDataPartition(y_p1_m3,
p = 0.8,
list = FALSE)
X_train_data_p1_m3 <- X_p1_m3[splitIndex_p1_m3, ]
X_test_data_p1_m3 <- X_p1_m3[-splitIndex_p1_m3, ]
y_train_data_p1_m3 <- y_p1_m3[splitIndex_p1_m3]
y_test_data_p1_m3 <- y_p1_m3[-splitIndex_p1_m3]
train_data_p1_m3 <- cbind(X_train_data_p1_m3, returns = y_train_data_p1_m3)
# Build Model
model_3_p1 <- lm(returns ~ ., data = train_data_p1_m3)
# Prediction for training sample
pred_train_p1_m3 <- predict(model_3_p1, newdata = X_train_data_p1_m3)
metrics_train_p1_m3 <- postResample(pred = pred_train_p1_m3,
obs = train_data_p1_m3$returns)
metrics_table_train_p1_m3 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_train_p1_m3["RMSE"]^2,
metrics_train_p1_m3["RMSE"],
metrics_train_p1_m3["MAE"],
metrics_train_p1_m3["Rsquared"])
)
# Prediction for testing sample
pred_test_p1_m3 <- predict(model_3_p1, newdata = X_test_data_p1_m3)
metrics_test_p1_m3 <- postResample(pred = pred_test_p1_m3,
obs = y_test_data_p1_m3)
metrics_table_test_p1_m3 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_test_p1_m3["RMSE"]^2
, metrics_test_p1_m3["RMSE"], metrics_test_p1_m3["MAE"], metrics_test_p1_m3["Rsquared"])
)## Performance Metrics for the Training Dataset:
## Metric Value
## 1 MSE 0.0005017145
## 2 RMSE 0.0223989841
## 3 MAE 0.0165767300
## 4 R-squared 0.2138294556
## Performance Metrics for the Testing Dataset:
## Metric Value
## 1 MSE 0.0003576866
## 2 RMSE 0.0189126033
## 3 MAE 0.0143617736
## 4 R-squared 0.3400332199
## Regression coefficient of Model 1:
##
## Call:
## lm(formula = returns ~ ., data = train_data_p1_m3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.080371 -0.012175 -0.000294 0.011288 0.072915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0111798 0.0077640 1.440 0.1501
## Mkt_beta 0.7686952 0.1176976 6.531 8.80e-11 ***
## log_dollar_vol -0.0005336 0.0003067 -1.740 0.0821 .
## log_tracking_error -0.0093686 0.0010685 -8.768 < 2e-16 ***
## log_market_share 0.0012796 0.0002437 5.250 1.73e-07 ***
## log_turnover_ratio 0.0016244 0.0006359 2.555 0.0107 *
## log_volatility 0.0052169 0.0008652 6.030 2.04e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02245 on 1565 degrees of freedom
## Multiple R-squared: 0.2138, Adjusted R-squared: 0.2108
## F-statistic: 70.94 on 6 and 1565 DF, p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
## Mkt_beta log_dollar_vol log_tracking_error log_market_share
## 1.572187 2.157164 1.414408 1.739820
## log_turnover_ratio log_volatility
## 1.628724 1.479572
# Scatterplot of Prediction vs Actual
plot_data_p1_m3 <- data.frame(
Predicted = pred_test_p1_m3,
Actual = y_test_data_p1_m3
)
plot_m3_p1 <- ggplot(data = plot_data_p1_m3,
aes(x = Predicted,
y = Actual)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom") +
labs(title = "Evaluating Model 3's Performance: Predicted vs. Actual Returns in the Non-COVID Period",
x = "Predicted returns",
y = "Actual returns")
print(plot_m3_p1)Important variable
# Extract coefficients
coef_m3 <- coef(model_3_p1)
importance_m3 <- data.frame(
Factors = names(coef_m3),
Coefficient = coef_m3
)
importance_m3 <- importance_m3[importance_m3$Factor != "(Intercept)", ]
# Plot with positive and negative impacts
plot_m3_p1_factors <- ggplot(importance_m3, aes(x = reorder(Factors, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Coefficient, 3)),
vjust = ifelse(importance_m3$Coefficient > 0, 0.5, 0.3),
color = "black") +
coord_flip() +
labs(title = "Impact of factors on Returns for Model 3",
x = "Factors",
y = "Coefficient") +
theme_minimal() +
scale_fill_manual(
values = c("TRUE" = "steelblue", "FALSE" = "coral"),
labels = c("TRUE" = "Positive Impact",
"FALSE" = "Negative Impact"),
name = ""
) +
theme(
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
)
print(plot_m3_p1_factors)Group by Sector
# Apply the model particularly by industry
pred_sector_p1_m3 <- cbind(X_test_data_p1_m3, SECTOR = task2_reg_p1_df2$SECTOR[-splitIndex_p1_m3])
data_sector_p1_m3 <- data.frame(
Predicted = pred_test_p1_m3,
Actual = y_test_data_p1_m3,
SECTOR = pred_sector_p1_m3$SECTOR
)
data_sector_p1_m3$SECTOR <- as.character(data_sector_p1_m3$SECTOR)
# Calculate evaluation metrics to see the performance of model apply on SECTOR
metrics_table_sector_p1_m3 <- data_sector_p1_m3 %>%
group_by(SECTOR) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_sector_p1_m3)## # A tibble: 3 × 5
## SECTOR MSE RMSE MAE R_Squared
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Communication_Services 0.000443 0.0210 0.0210 -Inf
## 2 Information_Technology 0.0000188 0.00434 0.00434 -Inf
## 3 Unknown 0.000358 0.0189 0.0144 0.333
# Plot the prediction vs actual by SECTOR
plot_m3_p1_sector <- ggplot(data = data_sector_p1_m3,
aes(x = Predicted,
y = Actual,
color = SECTOR)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 3's Performance: Predicted vs. Actual Returns",
subtitle = "During the Non-COVID Period (Grouped by Industry Sector)",
x = "Predicted returns",
y = "Actual returns",
color = "Industry Sector")
print(plot_m3_p1_sector)Group by Company size
# Apply the model particularly by company size
pred_size_p1_m3 <- cbind(X_test_data_p1_m3, company_size = task2_reg_p1_df2$company_size[-splitIndex_p1_m3])
data_size_p1_m3 <- data.frame(
Predicted = pred_test_p1_m3,
Actual = y_test_data_p1_m3,
company_size = pred_size_p1_m3$company_size
)
# Calculate evaluation metrics to see the performance of model apply on company size
metrics_table_size_p1_m3 <- data_size_p1_m3 %>%
group_by(company_size) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_size_p1_m3)## # A tibble: 3 × 5
## company_size MSE RMSE MAE R_Squared
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Small 0.000365 0.0191 0.0146 0.324
## 2 Medium 0.000374 0.0193 0.0142 0.303
## 3 Large 0.000189 0.0138 0.00937 0.495
# Plot the prediction vs actual by company size
plot_m3_p1_size <- ggplot(data = data_size_p1_m3,
aes(x = Predicted,
y = Actual,
color = company_size)) +
geom_point(size = 2, alpha = 0.7) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 3's Performance: Predicted vs. Actual Returns",
subtitle = "During the Non-COVID Period (Grouped by Company Size)",
x = "Predicted returns",
y = "Actual returns",
color = "Company Size")
print(plot_m3_p1_size)Model 4: (ETF) Returns: 2020-02-14 to 2020-03-20 & X variables: before 2019-02-14
# Period 2 - Dataset 2
task2_p2_df2_1 <- transform_dataset_for_task2(task2, "2020-02-14", "2020-03-20", "2019-08-20", "2020-02-14", "ETFs", FFF_beta_covid)## Highest Date: [1] "2020-02-14"
##
## Lowest Date: [1] "2020-03-18"
##
## Number of PERMNO in the dataset that calculated returns: 5764
## Number of PERMNO that are in the dataset that aggregate x variables: 2223
## Number of PERMNO that are in the both datasets: 2168
## Number of PERMNO that are in the final dataset: 2127
# Remove outlier for returns
task2_p2_df2_1 <- remove_outliers_all(task2_p2_df2_1, "returns")
# Inspect the distribution of Returns
print(
ggplot(task2_p2_df2_1, aes(x = returns)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
theme_minimal() +
labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
)## Sample size after remove outlier: 1966
# Create dummy variable for categorical variable
task2_reg_p2_df2 <- task2_p2_df2_1 %>%
filter(!is.na(volatility)) %>%
mutate(company_size = factor(company_size, levels = unique(task2_p2_df2_1$company_size)),
SECTOR = factor(SECTOR, levels = unique(task2_p2_df2_1$SECTOR))) %>%
bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .)))
# Inspect any missing value in important variables
colSums(is.na(task2_reg_p2_df2))## PERMNO returns
## 0 0
## SHRCD TICKER
## 0 0
## SECTOR company_size
## 0 0
## VOL dollar_vol
## 0 0
## bid_ask_spread turnover_ratio
## 0 0
## volatility sharpe_ratio
## 0 0
## market_share market_cap
## 0 0
## abs_corr_sp security_type
## 0 0
## tracking_error log_vol
## 0 0
## log_dollar_vol log_spread
## 0 0
## log_turnover_ratio log_volatility
## 0 0
## log_market_share log_market_cap
## 0 0
## log_tracking_error vol_cap_ratio
## 0 0
## log_vol_cap_ratio
## 0 0
##
## 0 0
##
## 0 0
##
## 0 0
##
## 0 0
## Mkt_beta
## 0 0
## SMB_beta HML_beta
## 0 0
## company_sizeSmall company_sizeMedium
## 0 0
## company_sizeLarge SECTORUnknown
## 0 0
## SECTORFinancials SECTORInformation_Technology
## 0 0
## SECTORHealth_Care
## 0
## tibble [1,947 × 47] (S3: tbl_df/tbl/data.frame)
## $ PERMNO : chr [1:1947] "10113" "11407" "11996" "12054" ...
## $ returns : num [1:1947] 0.6411 0.656 0.2445 0.6555 -0.0236 ...
## $ SHRCD : int [1:1947] 73 73 73 73 73 73 73 73 73 73 ...
## $ TICKER : chr [1:1947] "AADR" "LIT" "EMLC" "SCIF" ...
## $ SECTOR : Factor w/ 4 levels "Unknown","Financials",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ company_size : Factor w/ 3 levels "Small","Medium",..: 1 1 2 1 2 2 2 1 1 1 ...
## $ VOL : num [1:1947] 1.38e+06 2.18e+07 2.75e+08 5.28e+06 5.36e+07 ...
## $ dollar_vol : num [1:1947] 7.21e+07 6.04e+08 9.19e+09 1.73e+08 2.97e+09 ...
## $ bid_ask_spread : num [1:1947] 0.004037 0.001298 0.000419 0.003274 0.000287 ...
## $ turnover_ratio : num [1:1947] 0.00535 0.0095 0.01524 0.01108 0.00496 ...
## $ volatility : num [1:1947] 0.1065 0.1914 0.0576 0.1852 0.0368 ...
## $ sharpe_ratio : num [1:1947] -237 -137 -442 -140 -667 ...
## $ market_share : num [1:1947] 0.0011 4.8669 0.1553 0.1261 0.0696 ...
## $ market_cap : num [1:1947] 1.34e+10 6.01e+10 6.04e+11 1.56e+10 5.98e+11 ...
## $ abs_corr_sp : num [1:1947] 0.834 0.82 0.638 0.725 0.537 ...
## $ security_type : chr [1:1947] "ETF" "ETF" "ETF" "ETF" ...
## $ tracking_error : num [1:1947] 0.012 0.0157 0.0165 0.0179 0.0226 ...
## $ log_vol : num [1:1947] 14.1 16.9 19.4 15.5 17.8 ...
## $ log_dollar_vol : num [1:1947] 18.1 20.2 22.9 19 21.8 ...
## $ log_spread : num [1:1947] -5.51 -6.65 -7.77 -5.72 -8.15 ...
## $ log_turnover_ratio : num [1:1947] -5.23 -4.66 -4.18 -4.5 -5.31 ...
## $ log_volatility : num [1:1947] -2.24 -1.65 -2.85 -1.69 -3.3 ...
## $ log_market_share : num [1:1947] -6.81 1.58 -1.86 -2.07 -2.66 ...
## $ log_market_cap : num [1:1947] 23.3 24.8 27.1 23.5 27.1 ...
## $ log_tracking_error : num [1:1947] -4.42 -4.15 -4.1 -4.03 -3.79 ...
## $ vol_cap_ratio : num [1:1947] 1.03e-04 3.62e-04 4.55e-04 3.38e-04 8.97e-05 ...
## $ log_vol_cap_ratio : num [1:1947] -9.18 -7.92 -7.7 -7.99 -9.32 ...
## $ scaled_vol : num [1:1947, 1] -0.1716 -0.1184 0.541 -0.1615 -0.0353 ...
## ..- attr(*, "scaled:center")= num 67184777
## ..- attr(*, "scaled:scale")= num 3.83e+08
## $ scaled_dollar_vol : num [1:1947, 1] -0.0835 -0.0738 0.0828 -0.0816 -0.0306 ...
## ..- attr(*, "scaled:center")= num 4.65e+09
## ..- attr(*, "scaled:scale")= num 5.48e+10
## $ scaled_spread : num [1:1947, 1] 0.726 -0.324 -0.661 0.434 -0.711 ...
## ..- attr(*, "scaled:center")= num 0.00214
## ..- attr(*, "scaled:scale")= num 0.00261
## $ scaled_turnover_ratio : num [1:1947, 1] -0.2626 -0.1877 -0.0841 -0.1593 -0.2695 ...
## ..- attr(*, "scaled:center")= num 0.0199
## ..- attr(*, "scaled:scale")= num 0.0554
## $ scaled_volatility : num [1:1947, 1] -0.144 0.641 -0.596 0.584 -0.789 ...
## ..- attr(*, "scaled:center")= num 0.122
## ..- attr(*, "scaled:scale")= num 0.108
## $ scaled_sharpe_ratio : num [1:1947, 1] 0.2254 0.3206 0.0325 0.3179 -0.1805 ...
## ..- attr(*, "scaled:center")= num -476
## ..- attr(*, "scaled:scale")= num 1059
## $ scaled_market_share : num [1:1947, 1] -0.178 2.55 -0.092 -0.108 -0.14 ...
## ..- attr(*, "scaled:center")= num 0.319
## ..- attr(*, "scaled:scale")= num 1.78
## $ scaled_market_cap : num [1:1947, 1] -0.176 -0.139 0.293 -0.174 0.288 ...
## ..- attr(*, "scaled:center")= num 2.35e+11
## ..- attr(*, "scaled:scale")= num 1.26e+12
## $ scaled_abs_corr_sp : num [1:1947, 1] 0.3314 0.28 -0.4246 -0.0899 -0.8161 ...
## ..- attr(*, "scaled:center")= num 0.748
## ..- attr(*, "scaled:scale")= num 0.258
## $ scaled_tracking_error : num [1:1947, 1] -0.29226 -0.05883 -0.00565 0.07711 0.37667 ...
## ..- attr(*, "scaled:center")= num 0.0166
## ..- attr(*, "scaled:scale")= num 0.0157
## $ Mkt_beta : num [1:1947] 0.00699 0.01214 0.00149 0.00373 -0.00174 ...
## $ SMB_beta : num [1:1947] -0.001252 0.003313 -0.000873 0.001297 -0.000373 ...
## $ HML_beta : num [1:1947] -4.80e-03 2.80e-04 -4.41e-05 2.13e-03 -1.56e-03 ...
## $ company_sizeSmall : num [1:1947] 1 1 0 1 0 0 0 1 1 1 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeMedium : num [1:1947] 0 0 1 0 1 1 1 0 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeLarge : num [1:1947] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ SECTORUnknown : num [1:1947] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "assign")= int [1:4] 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORFinancials : num [1:1947] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:4] 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORInformation_Technology: num [1:1947] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:4] 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORHealth_Care : num [1:1947] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:4] 1 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 47
## PERMNO returns SHRCD TICKER SECTOR company_size VOL dollar_vol
## <chr> <dbl> <int> <chr> <fct> <fct> <dbl> <dbl>
## 1 10113 0.641 73 AADR Unknown Small 1377219 72137254.
## 2 11407 0.656 73 LIT Unknown Small 21779548 603877429.
## 3 11996 0.244 73 EMLC Unknown Medium 274606166 9192444608.
## 4 12054 0.656 73 SCIF Unknown Small 5283845 173272234.
## 5 12059 -0.0236 73 SCHR Unknown Medium 53634151 2970639004.
## 6 12064 -0.0158 73 SCHO Unknown Medium 120765351 6106187629.
## 7 12065 0.0853 73 SCHP Unknown Medium 87431310 4977006530.
## 8 12075 0.298 73 ELD Unknown Small 4382473 153885205.
## 9 12098 0.341 73 ECON Unknown Small 3895807 88874301.
## 10 12105 0.402 73 ENZL Unknown Small 5536801 300116929.
## # ℹ 39 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## # abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## # log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## # log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## # log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## # log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …
Correlation of variable
#Find the correlation between retruns and each potential explanatory variables
cor_p2_m4 <- cor(task2_reg_p2_df2%>% select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)), use = "pairwise.complete.obs")
# Correlation over absolute 0.7 = high risk of multicollinearity
high_corr_p2_m4 <- which(abs(cor_p2_m4) > 0.7, arr.ind = TRUE)
high_corr_pairs_p2_m4 <- data.frame(
Feature1 = rownames(cor_p2_m4)[high_corr_p2_m4[,1]],
Feature2 = colnames(cor_p2_m4)[high_corr_p2_m4[,2]],
Correlation = cor_p2_m4[high_corr_p2_m4]
)
high_corr_pairs_p2_m4 <- high_corr_pairs_p2_m4 %>% filter(Correlation < 1)
# Inspect the variables with multicollinearity
head(high_corr_pairs_p2_m4)## Feature1 Feature2 Correlation
## 1 market_cap dollar_vol 0.7210814
## 2 scaled_market_cap dollar_vol 0.7210814
## 3 log_spread bid_ask_spread 0.7265476
## 4 vol_cap_ratio turnover_ratio 0.7700479
## 5 log_volatility volatility 0.8074679
## 6 log_volatility sharpe_ratio 0.7121814
## Var1 Var2 value
## 1 returns returns 1.000000000
## 2 VOL returns -0.003687189
## 3 dollar_vol returns -0.005025484
## 4 bid_ask_spread returns 0.025407662
## 5 turnover_ratio returns -0.019845336
## 6 volatility returns 0.418324811
Build Model 4
# Splitting the dataset into training and testing sets
set.seed(847)
# COVID Period
X_p2_m4 <- task2_reg_p2_df2 %>%
select(Mkt_beta, log_dollar_vol, log_tracking_error, log_turnover_ratio, log_volatility, scaled_abs_corr_sp)
y_p2_m4 <- task2_reg_p2_df2$returns
splitIndex_p2_m4 <- createDataPartition(y_p2_m4,
p = 0.8,
list = FALSE)
X_train_data_p2_m4 <- X_p2_m4[splitIndex_p2_m4, ]
X_test_data_p2_m4 <- X_p2_m4[-splitIndex_p2_m4, ]
y_train_data_p2_m4 <- y_p2_m4[splitIndex_p2_m4]
y_test_data_p2_m4 <- y_p2_m4[-splitIndex_p2_m4]
train_data_p2_m4 <- cbind(X_train_data_p2_m4, returns = y_train_data_p2_m4)
# Build Model
model_4_p2 <- lm(returns ~ ., data = train_data_p2_m4)
# Prediction for training sample
pred_train_p2_m4 <- predict(model_4_p2, newdata = X_train_data_p2_m4)
metrics_train_p2_m4 <- postResample(pred = pred_train_p2_m4,
obs = train_data_p2_m4$returns)
metrics_table_train_p2_m4 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_train_p2_m4["RMSE"]^2,
metrics_train_p2_m4["RMSE"],
metrics_train_p2_m4["MAE"],
metrics_train_p2_m4["Rsquared"])
)
# Prediction for testing sample
pred_test_p2_m4 <- predict(model_4_p2, newdata = X_test_data_p2_m4)
metrics_test_p2_m4 <- postResample(pred = pred_test_p2_m4,
obs = y_test_data_p2_m4)
metrics_table_test_p2_m4 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_test_p2_m4["RMSE"]^2
, metrics_test_p2_m4["RMSE"], metrics_test_p2_m4["MAE"], metrics_test_p2_m4["Rsquared"])
)
print("Performance Metrics for the Training Dataset:")## [1] "Performance Metrics for the Training Dataset:"
## Metric Value
## 1 MSE 0.03139148
## 2 RMSE 0.17717642
## 3 MAE 0.12423672
## 4 R-squared 0.50715611
## [1] "Performance Metrics for the Testing Dataset:"
## Metric Value
## 1 MSE 0.02831917
## 2 RMSE 0.16828299
## 3 MAE 0.12242571
## 4 R-squared 0.55829343
##
## Call:
## lm(formula = returns ~ ., data = train_data_p2_m4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97116 -0.08898 -0.00791 0.08081 0.88828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.630172 0.059686 10.558 < 2e-16 ***
## Mkt_beta 19.898647 1.114378 17.856 < 2e-16 ***
## log_dollar_vol 0.006210 0.001951 3.183 0.00149 **
## log_tracking_error 0.102500 0.012439 8.240 3.62e-16 ***
## log_turnover_ratio -0.033037 0.005078 -6.506 1.04e-10 ***
## log_volatility 0.067368 0.007573 8.896 < 2e-16 ***
## scaled_abs_corr_sp 0.078958 0.007518 10.503 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1776 on 1552 degrees of freedom
## Multiple R-squared: 0.5072, Adjusted R-squared: 0.5053
## F-statistic: 266.2 on 6 and 1552 DF, p-value: < 2.2e-16
## Mkt_beta log_dollar_vol log_tracking_error log_turnover_ratio
## 1.901622 1.272908 2.537112 1.398859
## log_volatility scaled_abs_corr_sp
## 1.798556 2.896959
## Performance Metrics for the Training Dataset:
## Metric Value
## 1 MSE 0.03139148
## 2 RMSE 0.17717642
## 3 MAE 0.12423672
## 4 R-squared 0.50715611
## Performance Metrics for the Testing Dataset:
## Metric Value
## 1 MSE 0.02831917
## 2 RMSE 0.16828299
## 3 MAE 0.12242571
## 4 R-squared 0.55829343
## Regression coefficient of Model 1:
##
## Call:
## lm(formula = returns ~ ., data = train_data_p2_m4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97116 -0.08898 -0.00791 0.08081 0.88828
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.630172 0.059686 10.558 < 2e-16 ***
## Mkt_beta 19.898647 1.114378 17.856 < 2e-16 ***
## log_dollar_vol 0.006210 0.001951 3.183 0.00149 **
## log_tracking_error 0.102500 0.012439 8.240 3.62e-16 ***
## log_turnover_ratio -0.033037 0.005078 -6.506 1.04e-10 ***
## log_volatility 0.067368 0.007573 8.896 < 2e-16 ***
## scaled_abs_corr_sp 0.078958 0.007518 10.503 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1776 on 1552 degrees of freedom
## Multiple R-squared: 0.5072, Adjusted R-squared: 0.5053
## F-statistic: 266.2 on 6 and 1552 DF, p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
## Mkt_beta log_dollar_vol log_tracking_error log_turnover_ratio
## 1.901622 1.272908 2.537112 1.398859
## log_volatility scaled_abs_corr_sp
## 1.798556 2.896959
# Scatterplot of Prediction vs Actual
plot_data_p2_m4 <- data.frame(
Predicted = pred_test_p2_m4,
Actual = y_test_data_p2_m4
)
plot_m4_p2 <- ggplot(data = plot_data_p2_m4,
aes(x = Predicted,
y = Actual)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom") +
labs(title = "Evaluating Model 4's Performance: Predicted vs. Actual Returns in the COVID Period",
x = "Predicted returns",
y = "Actual returns")
print(plot_m4_p2)Important variable
# Extract coefficients
coef_m4 <- coef(model_4_p2)
importance_m4 <- data.frame(
Factors = names(coef_m4),
Coefficient = coef_m4
)
importance_m4 <- importance_m4[importance_m4$Factor != "(Intercept)", ]
# Plot with positive and negative impacts
plot_m4_p2_factors <- ggplot(importance_m4, aes(x = reorder(Factors, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Coefficient, 3)),
vjust = ifelse(importance_m4$Coefficient > 0, 0.5, 0.3),
color = "black") +
coord_flip() +
labs(title = "Impact of factors on Returns for Model 4",
x = "Factors",
y = "Coefficient") +
theme_minimal() +
scale_fill_manual(
values = c("TRUE" = "steelblue", "FALSE" = "coral"),
labels = c("TRUE" = "Positive Impact",
"FALSE" = "Negative Impact"),
name = ""
) +
theme(
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
)
print(plot_m4_p2_factors)Group by Sector
# Apply the model particularly by industry
pred_sector_p2_m4 <- cbind(X_test_data_p2_m4, SECTOR = task2_reg_p2_df2$SECTOR[-splitIndex_p2_m4])
data_sector_p2_m4 <- data.frame(
Predicted = pred_test_p2_m4,
Actual = y_test_data_p2_m4,
SECTOR = pred_sector_p2_m4$SECTOR
)
data_sector_p2_m4$SECTOR <- as.character(data_sector_p2_m4$SECTOR)
# Calculate evaluation metrics to see the performance of model apply on SECTOR
metrics_table_sector_p2_m4 <- data_sector_p2_m4 %>%
group_by(SECTOR) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_sector_p2_m4)## # A tibble: 1 × 5
## SECTOR MSE RMSE MAE R_Squared
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Unknown 0.0283 0.168 0.122 0.555
# Plot the prediction vs actual by SECTOR
plot_m4_p2_sector <- ggplot(data = data_sector_p2_m4,
aes(x = Predicted,
y = Actual,
color = SECTOR)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 4's Performance: Predicted vs. Actual Returns",
subtitle = "During the COVID Period (Grouped by Industry Sector)",
x = "Predicted returns",
y = "Actual returns",
color = "Industry Sector")
print(plot_m4_p2_sector)Group by Company size
# Apply the model particularly by company size
pred_size_p2_m4 <- cbind(X_test_data_p2_m4, company_size = task2_reg_p2_df2$company_size[-splitIndex_p2_m4])
data_size_p2_m4 <- data.frame(
Predicted = pred_test_p2_m4,
Actual = y_test_data_p2_m4,
company_size = pred_size_p2_m4$company_size
)
# Calculate evaluation metrics to see the performance of model apply on company size
metrics_table_size_p2_m4 <- data_size_p2_m4 %>%
group_by(company_size) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_size_p2_m4)## # A tibble: 3 × 5
## company_size MSE RMSE MAE R_Squared
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Small 0.0297 0.172 0.124 0.557
## 2 Medium 0.0245 0.157 0.127 0.389
## 3 Large 0.0145 0.120 0.0991 0.652
# Plot the prediction vs actual by company size
plot_m4_p2_size <- ggplot(data = data_size_p2_m4,
aes(x = Predicted,
y = Actual,
color = company_size)) +
geom_point(size = 2, alpha = 0.7) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 4's Performance: Predicted vs. Actual Returns",
subtitle = "During the COVID Period (Grouped by Company Size)",
x = "Predicted returns",
y = "Actual returns",
color = "Company Size")
print(plot_m4_p2_size)Model 5: (Stocks) Returns: 2019-12-14 to 2020-01-20 & X variables: before 2019-02-14
# Period 1 - Dataset 3
task2_p1_df3_1 <- transform_dataset_for_task2(task2, "2019-12-14", "2020-01-20", "2019-08-20", "2019-12-14", "stocks", FFF_beta_noncovid) ## Highest Date: [1] "2020-01-17"
##
## Lowest Date: [1] "2019-12-16"
##
## Number of PERMNO in the dataset that calculated returns: 5790
## Number of PERMNO that are in the dataset that aggregate x variables: 3660
## Number of PERMNO that are in the both datasets: 3598
## Number of PERMNO that are in the final dataset: 3553
# Remove outlier for returns
task2_p1_df3_1 <- remove_outliers_all(task2_p1_df3_1, "returns")
# Inspect the distribution of Returns
print(
ggplot(task2_p1_df3_1, aes(x = returns)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
theme_minimal() +
labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
)## Sample size after remove outlier: 3173
# Create dummy variable for categorical variable
task2_reg_p1_df3 <- task2_p1_df3_1 %>%
filter(!is.na(volatility)) %>%
mutate(company_size = factor(company_size, levels = unique(task2_p1_df3_1$company_size)),
SECTOR = factor(SECTOR, levels = unique(task2_p1_df3_1$SECTOR))) %>%
bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .))) ## tibble [3,157 × 55] (S3: tbl_df/tbl/data.frame)
## $ PERMNO : chr [1:3157] "10026" "10028" "10032" "10044" ...
## $ returns : num [1:3157] 0.01925 0.1259 0.01087 -0.00613 -0.00417 ...
## $ SHRCD : int [1:3157] 11 11 11 11 11 11 11 11 11 11 ...
## $ TICKER : chr [1:3157] "JJSF" "DGSE" "PLXS" "RMCF" ...
## $ SECTOR : Factor w/ 12 levels "Consumer_Staples",..: 1 2 3 1 2 3 3 4 5 5 ...
## $ company_size : Factor w/ 3 levels "Medium","Small",..: 1 2 2 2 2 3 3 3 3 2 ...
## $ VOL : num [1:3157] 7586162 4650390 13170524 658814 32686198 ...
## $ dollar_vol : num [1:3157] 1.44e+09 6.27e+06 9.09e+08 5.93e+06 7.24e+08 ...
## $ bid_ask_spread : num [1:3157] 0.00109 0.015923 0.000649 0.011499 0.000773 ...
## $ turnover_ratio : num [1:3157] 0.0049 0.00211 0.00551 0.00134 0.01069 ...
## $ volatility : num [1:3157] 0.176 0.658 0.28 0.153 0.364 ...
## $ sharpe_ratio : num [1:3157] -143.4 -38.2 -91 -161.8 -68.9 ...
## $ market_share : num [1:3157] 0.0286 0 0.0252 0.2334 0.0562 ...
## $ market_cap : num [1:3157] 2.94e+11 2.76e+09 1.62e+11 4.45e+09 6.75e+10 ...
## $ abs_corr_sp : num [1:3157] 0.72 0.318 0.635 0.193 0.595 ...
## $ security_type : chr [1:3157] "Stock" "Stock" "Stock" "Stock" ...
## $ tracking_error : num [1:3157] 0.0225 0.0523 0.0261 0.0403 0.0324 ...
## $ log_vol : num [1:3157] 15.8 15.4 16.4 13.4 17.3 ...
## $ log_dollar_vol : num [1:3157] 21.1 15.7 20.6 15.6 20.4 ...
## $ log_spread : num [1:3157] -6.82 -4.14 -7.34 -4.47 -7.16 ...
## $ log_turnover_ratio : num [1:3157] -5.32 -6.16 -5.2 -6.61 -4.54 ...
## $ log_volatility : num [1:3157] -1.736 -0.418 -1.274 -1.878 -1.011 ...
## $ log_market_share : num [1:3157] -3.56 -13.82 -3.68 -1.46 -2.88 ...
## $ log_market_cap : num [1:3157] 26.4 21.7 25.8 22.2 24.9 ...
## $ log_tracking_error : num [1:3157] -3.79 -2.95 -3.65 -3.21 -3.43 ...
## $ vol_cap_ratio : num [1:3157] 2.58e-05 1.69e-03 8.15e-05 1.48e-04 4.84e-04 ...
## $ log_vol_cap_ratio : num [1:3157] -10.56 -6.38 -9.41 -8.82 -7.63 ...
## $ scaled_vol : num [1:3157, 1] -0.325 -0.337 -0.302 -0.353 -0.221 ...
## ..- attr(*, "scaled:center")= num 86487161
## ..- attr(*, "scaled:scale")= num 2.43e+08
## $ scaled_dollar_vol : num [1:3157, 1] -0.162 -0.247 -0.194 -0.247 -0.205 ...
## ..- attr(*, "scaled:center")= num 4.18e+09
## ..- attr(*, "scaled:scale")= num 1.69e+10
## $ scaled_spread : num [1:3157, 1] -0.442 0.617 -0.473 0.301 -0.465 ...
## ..- attr(*, "scaled:center")= num 0.00728
## ..- attr(*, "scaled:scale")= num 0.014
## $ scaled_turnover_ratio : num [1:3157, 1] -0.0337 -0.0406 -0.0322 -0.0425 -0.0193 ...
## ..- attr(*, "scaled:center")= num 0.0185
## ..- attr(*, "scaled:scale")= num 0.403
## $ scaled_volatility : num [1:3157, 1] -0.717 0.437 -0.469 -0.773 -0.268 ...
## ..- attr(*, "scaled:center")= num 0.476
## ..- attr(*, "scaled:scale")= num 0.418
## $ scaled_sharpe_ratio : num [1:3157, 1] -0.4259 0.4523 0.0118 -0.5794 0.1957 ...
## ..- attr(*, "scaled:center")= num -92.4
## ..- attr(*, "scaled:scale")= num 120
## $ scaled_market_share : num [1:3157, 1] -0.174 -0.177 -0.174 -0.15 -0.171 ...
## ..- attr(*, "scaled:center")= num 1.51
## ..- attr(*, "scaled:scale")= num 8.49
## $ scaled_market_cap : num [1:3157, 1] -0.111 -0.199 -0.151 -0.198 -0.179 ...
## ..- attr(*, "scaled:center")= num 6.64e+11
## ..- attr(*, "scaled:scale")= num 3.32e+12
## $ scaled_abs_corr_sp : num [1:3157, 1] 1.047 -0.766 0.663 -1.333 0.481 ...
## ..- attr(*, "scaled:center")= num 0.488
## ..- attr(*, "scaled:scale")= num 0.221
## $ scaled_tracking_error : num [1:3157, 1] -0.649 0.181 -0.55 -0.153 -0.374 ...
## ..- attr(*, "scaled:center")= num 0.0458
## ..- attr(*, "scaled:scale")= num 0.0359
## $ Mkt_beta : num [1:3157] 0.00362 0.01878 0.01514 0.00029 0.00657 ...
## $ SMB_beta : num [1:3157] 0.001857 0.003485 0.005584 -0.000635 0.013586 ...
## $ HML_beta : num [1:3157] -0.00244 0.00175 -0.00202 -0.002 0.00093 ...
## $ company_sizeMedium : num [1:3157] 1 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeSmall : num [1:3157] 0 1 1 1 1 0 0 0 0 1 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeLarge : num [1:3157] 0 0 0 0 0 1 1 1 1 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ SECTORConsumer_Staples : num [1:3157] 1 0 0 1 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORUnknown : num [1:3157] 0 1 0 0 1 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORInformation_Technology: num [1:3157] 0 0 1 0 0 1 1 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORFinancials : num [1:3157] 0 0 0 0 0 0 0 1 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORIndustrials : num [1:3157] 0 0 0 0 0 0 0 0 1 1 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORHealth_Care : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORConsumer_Discretionary: num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORMaterials : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORReal_Estate : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORUtilities : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTOREnergy : num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORCommunication_Services: num [1:3157] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 55
## PERMNO returns SHRCD TICKER SECTOR company_size VOL dollar_vol
## <chr> <dbl> <int> <chr> <fct> <fct> <dbl> <dbl>
## 1 10026 0.0192 11 JJSF Consumer_Staples Medium 7.59e6 1.44e 9
## 2 10028 0.126 11 DGSE Unknown Small 4.65e6 6.27e 6
## 3 10032 0.0109 11 PLXS Information_Tech… Small 1.32e7 9.09e 8
## 4 10044 -0.00613 11 RMCF Consumer_Staples Small 6.59e5 5.93e 6
## 5 10051 -0.00417 11 HNGR Unknown Small 3.27e7 7.24e 8
## 6 10104 0.0211 11 ORCL Information_Tech… Large 9.55e8 5.21e10
## 7 10107 0.0744 11 MSFT Information_Tech… Large 1.84e9 2.61e11
## 8 10138 0.0732 11 TROW Financials Large 7.70e7 8.87e 9
## 9 10145 0.0387 11 HON Industrials Large 2.26e8 3.85e10
## 10 10158 0.205 11 AMRC Industrials Small 9.85e6 1.52e 8
## # ℹ 47 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## # abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## # log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## # log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## # log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## # log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …
Correlation of variable
#Find the correlation between retruns and each potential explanatory variables
cor_p1_m5 <- cor(task2_reg_p1_df3%>% select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)), use = "pairwise.complete.obs")
# Correlation over absolute 0.7 = high risk of multicollinearity
high_corr_p1_m5 <- which(abs(cor_p1_m5) > 0.7, arr.ind = TRUE)
high_corr_pairs_p1_m5 <- data.frame(
Feature1 = rownames(cor_p1_m5)[high_corr_p1_m5[,1]],
Feature2 = colnames(cor_p1_m5)[high_corr_p1_m5[,2]],
Correlation = cor_p1_m5[high_corr_p1_m5]
)
high_corr_pairs_p1_m5 <- high_corr_pairs_p1_m5 %>% filter(Correlation < 1)
# Inspect the variables with multicollinearity
head(high_corr_pairs_p1_m5)## Feature1 Feature2 Correlation
## 1 market_cap dollar_vol 0.8922280
## 2 scaled_market_cap dollar_vol 0.8922280
## 3 log_spread bid_ask_spread 0.7390434
## 4 log_volatility volatility 0.7927991
## 5 log_volatility sharpe_ratio 0.7391650
## 6 dollar_vol market_cap 0.8922280
## Var1 Var2 value
## 1 returns returns 1.000000000
## 2 VOL returns 0.009516856
## 3 dollar_vol returns 0.061734657
## 4 bid_ask_spread returns -0.023646635
## 5 turnover_ratio returns 0.041818227
## 6 volatility returns 0.064446949
Build Model 5
# Splitting the dataset into training and testing sets
set.seed(847)
# Non-COVID Period
X_p1_m5 <- task2_reg_p1_df3 %>%
select(Mkt_beta, SMB_beta, HML_beta, log_turnover_ratio, log_volatility, vol_cap_ratio, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown)
y_p1_m5 <- task2_reg_p1_df3$returns
splitIndex_p1_m5 <- createDataPartition(y_p1_m5,
p = 0.8,
list = FALSE)
X_train_data_p1_m5 <- X_p1_m5[splitIndex_p1_m5, ]
X_test_data_p1_m5 <- X_p1_m5[-splitIndex_p1_m5, ]
y_train_data_p1_m5 <- y_p1_m5[splitIndex_p1_m5]
y_test_data_p1_m5 <- y_p1_m5[-splitIndex_p1_m5]
train_data_p1_m5 <- cbind(X_train_data_p1_m5, returns = y_train_data_p1_m5)
# Build Model
model_5_p1 <- lm(returns ~ ., data = train_data_p1_m5)
# Prediction for training sample
pred_train_p1_m5 <- predict(model_5_p1, newdata = X_train_data_p1_m5)
metrics_train_p1_m5 <- postResample(pred = pred_train_p1_m5,
obs = train_data_p1_m5$returns)
metrics_table_train_p1_m5 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_train_p1_m5["RMSE"]^2,
metrics_train_p1_m5["RMSE"],
metrics_train_p1_m5["MAE"],
metrics_train_p1_m5["Rsquared"])
)
# Prediction for testing sample
pred_test_p1_m5 <- predict(model_5_p1, newdata = X_test_data_p1_m5)
metrics_test_p1_m5 <- postResample(pred = pred_test_p1_m5,
obs = y_test_data_p1_m5)
metrics_table_test_p1_m5 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_test_p1_m5["RMSE"]^2
, metrics_test_p1_m5["RMSE"], metrics_test_p1_m5["MAE"], metrics_test_p1_m5["Rsquared"])
)## Performance Metrics for the Training Dataset:
## Metric Value
## 1 MSE 0.005882033
## 2 RMSE 0.076694414
## 3 MAE 0.057478969
## 4 R-squared 0.089301378
## Performance Metrics for the Testing Dataset:
## Metric Value
## 1 MSE 0.006180838
## 2 RMSE 0.078618304
## 3 MAE 0.057724385
## 4 R-squared 0.054573909
## Regression coefficient of Model 1:
##
## Call:
## lm(formula = returns ~ ., data = train_data_p1_m5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23497 -0.04214 -0.00136 0.04151 0.25555
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0829358 0.0124833 6.644 3.74e-11 ***
## Mkt_beta -0.3487390 0.2313241 -1.508 0.13179
## SMB_beta -0.7431799 0.1642898 -4.524 6.36e-06 ***
## HML_beta -0.9928276 0.1839652 -5.397 7.42e-08 ***
## log_turnover_ratio 0.0072711 0.0015790 4.605 4.33e-06 ***
## log_volatility 0.0114468 0.0029190 3.921 9.04e-05 ***
## vol_cap_ratio 0.0900588 0.0688961 1.307 0.19128
## SECTORMaterials -0.0231078 0.0118721 -1.946 0.05172 .
## SECTORIndustrials 0.0084369 0.0090790 0.929 0.35284
## SECTORFinancials -0.0165188 0.0088529 -1.866 0.06217 .
## SECTORInformation_Technology 0.0295801 0.0092036 3.214 0.00133 **
## SECTORConsumer_Discretionary -0.0009514 0.0092070 -0.103 0.91771
## SECTORHealth_Care 0.0168308 0.0090503 1.860 0.06305 .
## SECTOREnergy -0.0083266 0.0119167 -0.699 0.48478
## SECTORUtilities 0.0056029 0.0131535 0.426 0.67017
## SECTORReal_Estate -0.0059908 0.0189206 -0.317 0.75155
## SECTORCommunication_Services 0.0338927 0.0117035 2.896 0.00381 **
## SECTORUnknown 0.0061192 0.0084326 0.726 0.46812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07697 on 2510 degrees of freedom
## Multiple R-squared: 0.0893, Adjusted R-squared: 0.08313
## F-statistic: 14.48 on 17 and 2510 DF, p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
## Mkt_beta SMB_beta
## 1.168363 1.319472
## HML_beta log_turnover_ratio
## 1.213377 1.340978
## log_volatility vol_cap_ratio
## 1.454083 1.147608
## SECTORMaterials SECTORIndustrials
## 1.731422 3.529172
## SECTORFinancials SECTORInformation_Technology
## 4.169097 3.403189
## SECTORConsumer_Discretionary SECTORHealth_Care
## 3.360413 4.017917
## SECTOREnergy SECTORUtilities
## 1.901831 1.543359
## SECTORReal_Estate SECTORCommunication_Services
## 1.199001 1.747773
## SECTORUnknown
## 5.511434
## Despite SectorUnknown has a VIF over 5, considering it is actually the group of data with missing value in Sector, while some other industry categories are important to the model. After trial and error, it is better to keep the industry variable in the model.
# Scatterplot of Prediction vs Actual
plot_data_p1_m5 <- data.frame(
Predicted = pred_test_p1_m5,
Actual = y_test_data_p1_m5
)
plot_m5_p1 <- ggplot(data = plot_data_p1_m5,
aes(x = Predicted,
y = Actual)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom") +
labs(title = "Evaluating Model 5's Performance: Predicted vs. Actual Returns in the Non-COVID Period",
x = "Predicted returns",
y = "Actual returns")
print(plot_m5_p1)Important variable
# Extract coefficients
coef_m5 <- coef(model_5_p1)
importance_m5 <- data.frame(
Factors = names(coef_m5),
Coefficient = coef_m5
)
importance_m5 <- importance_m5[importance_m5$Factor != "(Intercept)", ]
# Plot with positive and negative impacts
plot_m5_p1_factors <- ggplot(importance_m5, aes(x = reorder(Factors, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Coefficient, 3)),
vjust = ifelse(importance_m5$Coefficient > 0, 0.5, 0.3),
color = "black") +
coord_flip() +
labs(title = "Impact of factors on Returns for Model 5",
x = "Factors",
y = "Coefficient") +
theme_minimal() +
scale_fill_manual(
values = c("TRUE" = "steelblue", "FALSE" = "coral"),
labels = c("TRUE" = "Positive Impact",
"FALSE" = "Negative Impact"),
name = ""
) +
theme(
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
)
print(plot_m5_p1_factors)Group by Sector
# Apply the model particularly by industry
pred_sector_p1_m5 <- cbind(X_test_data_p1_m5, SECTOR = task2_reg_p1_df3$SECTOR[-splitIndex_p1_m5])
data_sector_p1_m5 <- data.frame(
Predicted = pred_test_p1_m5,
Actual = y_test_data_p1_m5,
SECTOR = pred_sector_p1_m5$SECTOR
)
data_sector_p1_m5$SECTOR <- as.character(data_sector_p1_m5$SECTOR)
# Calculate evaluation metrics to see the performance of model apply on SECTOR
metrics_table_sector_p1_m5 <- data_sector_p1_m5 %>%
group_by(SECTOR) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_sector_p1_m5)## # A tibble: 12 × 5
## SECTOR MSE RMSE MAE R_Squared
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Communication_Services 0.00307 0.0554 0.0491 0.0476
## 2 Consumer_Discretionary 0.00543 0.0737 0.0535 0.0374
## 3 Consumer_Staples 0.00458 0.0677 0.0517 -0.0289
## 4 Energy 0.0139 0.118 0.0995 -0.0464
## 5 Financials 0.00155 0.0394 0.0309 0.142
## 6 Health_Care 0.0103 0.101 0.0792 0.00111
## 7 Industrials 0.00630 0.0794 0.0569 0.0526
## 8 Information_Technology 0.00820 0.0906 0.0717 -0.00333
## 9 Materials 0.00700 0.0837 0.0632 -0.360
## 10 Real_Estate 0.00679 0.0824 0.0638 0.111
## 11 Unknown 0.00653 0.0808 0.0591 0.00328
## 12 Utilities 0.00198 0.0445 0.0303 -0.0203
# Plot the prediction vs actual by SECTOR
plot_m5_p1_sector <- ggplot(data = data_sector_p1_m5,
aes(x = Predicted,
y = Actual,
color = SECTOR)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 5's Performance: Predicted vs. Actual Returns",
subtitle = "During the Non-COVID Period (Grouped by Industry Sector)",
x = "Predicted returns",
y = "Actual returns",
color = "Industry Sector")
print(plot_m5_p1_sector)Group by Company size
# Apply the model particularly by company size
pred_size_p1_m5 <- cbind(X_test_data_p1_m5, company_size = task2_reg_p1_df3$company_size[-splitIndex_p1_m5])
data_size_p1_m5 <- data.frame(
Predicted = pred_test_p1_m5,
Actual = y_test_data_p1_m5,
company_size = pred_size_p1_m5$company_size
)
# Calculate evaluation metrics to see the performance of model apply on company size
metrics_table_size_p1_m5 <- data_size_p1_m5 %>%
group_by(company_size) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_size_p1_m5)## # A tibble: 3 × 5
## company_size MSE RMSE MAE R_Squared
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Medium 0.00481 0.0694 0.0499 0.0325
## 2 Small 0.00745 0.0863 0.0648 0.0140
## 3 Large 0.00236 0.0486 0.0368 0.131
# Plot the prediction vs actual by company size
plot_m5_p1_size <- ggplot(data = data_size_p1_m5,
aes(x = Predicted,
y = Actual,
color = company_size)) +
geom_point(size = 2, alpha = 0.7) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 5's Performance: Predicted vs. Actual Returns",
subtitle = "During the Non-COVID Period (Grouped by Company Size)",
x = "Predicted returns",
y = "Actual returns",
color = "Company Size")
print(plot_m5_p1_size)Model 6: (Stock) Returns: 2020-02-14 to 2020-03-20 & X variables: before 2019-02-14
# Period 2 - Dataset 3
task2_p2_df3_1 <- transform_dataset_for_task2(task2, "2020-02-14", "2020-03-20", "2019-08-20", "2020-02-14", "stocks", FFF_beta_covid)## Highest Date: [1] "2020-02-14"
##
## Lowest Date: [1] "2020-03-18"
##
## Number of PERMNO in the dataset that calculated returns: 5764
## Number of PERMNO that are in the dataset that aggregate x variables: 3701
## Number of PERMNO that are in the both datasets: 3595
## Number of PERMNO that are in the final dataset: 3581
# Remove outlier for returns
task2_p2_df3_1 <- remove_outliers_all(task2_p2_df3_1, "returns")
# Inspect the distribution of Returns
print(
ggplot(task2_p2_df3_1, aes(x = returns)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
theme_minimal() +
labs(title = "Histogram of Returns", x = "Returns", y = "Frequency")
)## Sample size after remove outlier: 3363
# Create dummy variable for categorical variable
task2_reg_p2_df3 <- task2_p2_df3_1 %>%
filter(!is.na(volatility)) %>%
mutate(company_size = factor(company_size, levels = unique(task2_p2_df3_1$company_size)),
SECTOR = factor(SECTOR, levels = unique(task2_p2_df3_1$SECTOR))) %>%
bind_cols(as_tibble(model.matrix(~ company_size - 1, data = .))) %>%
bind_cols(as_tibble(model.matrix(~ SECTOR - 1, data = .)))
# Inspect any missing value in important variables
colSums(is.na(task2_reg_p2_df3))## PERMNO returns
## 0 0
## SHRCD TICKER
## 0 0
## SECTOR company_size
## 0 0
## VOL dollar_vol
## 0 0
## bid_ask_spread turnover_ratio
## 0 0
## volatility sharpe_ratio
## 0 0
## market_share market_cap
## 0 0
## abs_corr_sp security_type
## 0 0
## tracking_error log_vol
## 0 0
## log_dollar_vol log_spread
## 0 0
## log_turnover_ratio log_volatility
## 0 0
## log_market_share log_market_cap
## 0 0
## log_tracking_error vol_cap_ratio
## 0 0
## log_vol_cap_ratio
## 0 0
##
## 0 0
##
## 0 0
##
## 0 0
##
## 0 0
## Mkt_beta
## 0 0
## SMB_beta HML_beta
## 0 0
## company_sizeMedium company_sizeSmall
## 0 0
## company_sizeLarge SECTORConsumer_Staples
## 0 0
## SECTORUnknown SECTORInformation_Technology
## 0 0
## SECTORFinancials SECTORIndustrials
## 0 0
## SECTORHealth_Care SECTORConsumer_Discretionary
## 0 0
## SECTORMaterials SECTORReal_Estate
## 0 0
## SECTORUtilities SECTOREnergy
## 0 0
## SECTORCommunication_Services
## 0
## # A tibble: 3,335 × 55
## PERMNO returns SHRCD TICKER SECTOR company_size VOL dollar_vol
## <chr> <dbl> <int> <chr> <fct> <fct> <dbl> <dbl>
## 1 10026 0.504 11 JJSF Consumer_Staples Medium 1.19e7 2.20e 9
## 2 10028 0.0106 11 DGSE Unknown Small 9.71e6 1.62e 7
## 3 10032 0.965 11 PLXS Information_Techn… Small 2.09e7 1.50e 9
## 4 10044 0.551 11 RMCF Consumer_Staples Small 1.41e6 1.25e 7
## 5 10051 1.06 11 HNGR Unknown Small 4.25e7 9.85e 8
## 6 10104 0.173 11 ORCL Information_Techn… Large 1.37e9 7.47e10
## 7 10107 0.320 11 MSFT Information_Techn… Large 3.00e9 4.57e11
## 8 10138 0.267 11 TROW Financials Large 1.21e8 1.46e10
## 9 10145 0.512 11 HON Industrials Large 3.35e8 5.77e10
## 10 10158 0.259 11 AMRC Industrials Small 1.69e7 2.88e 8
## # ℹ 3,325 more rows
## # ℹ 47 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## # abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## # log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## # log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## # log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>, …
## tibble [3,335 × 55] (S3: tbl_df/tbl/data.frame)
## $ PERMNO : chr [1:3335] "10026" "10028" "10032" "10044" ...
## $ returns : num [1:3335] 0.5036 0.0106 0.9654 0.5507 1.0622 ...
## $ SHRCD : int [1:3335] 11 11 11 11 11 11 11 11 11 11 ...
## $ TICKER : chr [1:3335] "JJSF" "DGSE" "PLXS" "RMCF" ...
## $ SECTOR : Factor w/ 12 levels "Consumer_Staples",..: 1 2 3 1 2 3 3 4 5 5 ...
## $ company_size : Factor w/ 3 levels "Medium","Small",..: 1 2 2 2 2 3 3 3 3 2 ...
## $ VOL : num [1:3335] 11872557 9713299 20878422 1408090 42541489 ...
## $ dollar_vol : num [1:3335] 2.20e+09 1.62e+07 1.50e+09 1.25e+07 9.85e+08 ...
## $ bid_ask_spread : num [1:3335] 0.001024 0.014651 0.000675 0.011492 0.000678 ...
## $ turnover_ratio : num [1:3335] 0.00507 0.00291 0.00577 0.00189 0.0092 ...
## $ volatility : num [1:3335] 0.195 0.618 0.238 0.15 0.296 ...
## $ sharpe_ratio : num [1:3335] -137.3 -41.7 -109.6 -162.8 -88.8 ...
## $ market_share : num [1:3335] 0.0422 0.024 0.0359 0.3338 0.0776 ...
## $ market_cap : num [1:3335] 4.36e+11 4.72e+09 2.55e+11 6.66e+09 1.09e+11 ...
## $ abs_corr_sp : num [1:3335] 0.72 0.318 0.635 0.193 0.595 ...
## $ security_type : chr [1:3335] "Stock" "Stock" "Stock" "Stock" ...
## $ tracking_error : num [1:3335] 0.0225 0.0523 0.0261 0.0403 0.0324 ...
## $ log_vol : num [1:3335] 16.3 16.1 16.9 14.2 17.6 ...
## $ log_dollar_vol : num [1:3335] 21.5 16.6 21.1 16.3 20.7 ...
## $ log_spread : num [1:3335] -6.88 -4.22 -7.3 -4.47 -7.29 ...
## $ log_turnover_ratio : num [1:3335] -5.28 -5.84 -5.15 -6.27 -4.69 ...
## $ log_volatility : num [1:3335] -1.637 -0.482 -1.434 -1.896 -1.218 ...
## $ log_market_share : num [1:3335] -3.16 -3.73 -3.33 -1.1 -2.56 ...
## $ log_market_cap : num [1:3335] 26.8 22.3 26.3 22.6 25.4 ...
## $ log_tracking_error : num [1:3335] -3.79 -2.95 -3.65 -3.21 -3.43 ...
## $ vol_cap_ratio : num [1:3335] 2.72e-05 2.06e-03 8.18e-05 2.11e-04 3.91e-04 ...
## $ log_vol_cap_ratio : num [1:3335] -10.51 -6.19 -9.41 -8.46 -7.85 ...
## $ scaled_vol : num [1:3335, 1] -0.323 -0.329 -0.299 -0.351 -0.242 ...
## ..- attr(*, "scaled:center")= num 1.34e+08
## ..- attr(*, "scaled:scale")= num 3.77e+08
## $ scaled_dollar_vol : num [1:3335, 1] -0.147 -0.22 -0.171 -0.22 -0.188 ...
## ..- attr(*, "scaled:center")= num 6.65e+09
## ..- attr(*, "scaled:scale")= num 3.02e+10
## $ scaled_spread : num [1:3335, 1] -0.447 0.577 -0.473 0.339 -0.473 ...
## ..- attr(*, "scaled:center")= num 0.00698
## ..- attr(*, "scaled:scale")= num 0.0133
## $ scaled_turnover_ratio : num [1:3335, 1] -0.0442 -0.0523 -0.0415 -0.0561 -0.0287 ...
## ..- attr(*, "scaled:center")= num 0.0168
## ..- attr(*, "scaled:scale")= num 0.266
## $ scaled_volatility : num [1:3335, 1] -0.692 0.369 -0.582 -0.803 -0.438 ...
## ..- attr(*, "scaled:center")= num 0.471
## ..- attr(*, "scaled:scale")= num 0.399
## $ scaled_sharpe_ratio : num [1:3335, 1] -0.3362 0.4691 -0.1031 -0.5506 0.0721 ...
## ..- attr(*, "scaled:center")= num -97.4
## ..- attr(*, "scaled:scale")= num 119
## $ scaled_market_share : num [1:3335, 1] -0.174 -0.176 -0.175 -0.15 -0.171 ...
## ..- attr(*, "scaled:center")= num 2.12
## ..- attr(*, "scaled:scale")= num 12
## $ scaled_market_cap : num [1:3335, 1] -0.112 -0.194 -0.146 -0.194 -0.175 ...
## ..- attr(*, "scaled:center")= num 1.02e+12
## ..- attr(*, "scaled:scale")= num 5.22e+12
## $ scaled_abs_corr_sp : num [1:3335, 1] 1.054 -0.755 0.671 -1.32 0.49 ...
## ..- attr(*, "scaled:center")= num 0.486
## ..- attr(*, "scaled:scale")= num 0.222
## $ scaled_tracking_error : num [1:3335, 1] -0.653 0.178 -0.554 -0.157 -0.378 ...
## ..- attr(*, "scaled:center")= num 0.0459
## ..- attr(*, "scaled:scale")= num 0.0359
## $ Mkt_beta : num [1:3335] 0.0017 0.014261 0.014132 0.000384 0.003971 ...
## $ SMB_beta : num [1:3335] 0.00197 -0.00226 0.00549 0.00192 0.01299 ...
## $ HML_beta : num [1:3335] -0.000521 0.000203 0.000435 -0.002509 0.002974 ...
## $ company_sizeMedium : num [1:3335] 1 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeSmall : num [1:3335] 0 1 1 1 1 0 0 0 0 1 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ company_sizeLarge : num [1:3335] 0 0 0 0 0 1 1 1 1 0 ...
## ..- attr(*, "assign")= int [1:3] 1 1 1
## ..- attr(*, "contrasts")=List of 1
## .. ..$ company_size: chr "contr.treatment"
## $ SECTORConsumer_Staples : num [1:3335] 1 0 0 1 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORUnknown : num [1:3335] 0 1 0 0 1 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORInformation_Technology: num [1:3335] 0 0 1 0 0 1 1 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORFinancials : num [1:3335] 0 0 0 0 0 0 0 1 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORIndustrials : num [1:3335] 0 0 0 0 0 0 0 0 1 1 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORHealth_Care : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORConsumer_Discretionary: num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORMaterials : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORReal_Estate : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORUtilities : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTOREnergy : num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## $ SECTORCommunication_Services: num [1:3335] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, "assign")= int [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "contrasts")=List of 1
## .. ..$ SECTOR: chr "contr.treatment"
## # A tibble: 10 × 55
## PERMNO returns SHRCD TICKER SECTOR company_size VOL dollar_vol
## <chr> <dbl> <int> <chr> <fct> <fct> <dbl> <dbl>
## 1 10026 0.504 11 JJSF Consumer_Staples Medium 1.19e7 2.20e 9
## 2 10028 0.0106 11 DGSE Unknown Small 9.71e6 1.62e 7
## 3 10032 0.965 11 PLXS Information_Techn… Small 2.09e7 1.50e 9
## 4 10044 0.551 11 RMCF Consumer_Staples Small 1.41e6 1.25e 7
## 5 10051 1.06 11 HNGR Unknown Small 4.25e7 9.85e 8
## 6 10104 0.173 11 ORCL Information_Techn… Large 1.37e9 7.47e10
## 7 10107 0.320 11 MSFT Information_Techn… Large 3.00e9 4.57e11
## 8 10138 0.267 11 TROW Financials Large 1.21e8 1.46e10
## 9 10145 0.512 11 HON Industrials Large 3.35e8 5.77e10
## 10 10158 0.259 11 AMRC Industrials Small 1.69e7 2.88e 8
## # ℹ 47 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## # abs_corr_sp <dbl>, security_type <chr>, tracking_error <dbl>,
## # log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## # log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## # log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## # log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …
Correlation of variable
#Find the correlation between retruns and each potential explanatory variables
cor_p2_m6 <- cor(task2_reg_p2_df2%>% select(-c(PERMNO, SHRCD, TICKER, SECTOR, company_size, security_type)), use = "pairwise.complete.obs")
# Correlation over absolute 0.7 = high risk of multicollinearity
high_corr_p2_m6 <- which(abs(cor_p2_m6) > 0.7, arr.ind = TRUE)
high_corr_pairs_p2_m6 <- data.frame(
Feature1 = rownames(cor_p2_m6)[high_corr_p2_m6[,1]],
Feature2 = colnames(cor_p2_m6)[high_corr_p2_m6[,2]],
Correlation = cor_p2_m6[high_corr_p2_m6]
)
high_corr_pairs_p2_m6 <- high_corr_pairs_p2_m6 %>% filter(Correlation < 1)
# Inspect the variables with multicollinearity
head(high_corr_pairs_p2_m6)## Feature1 Feature2 Correlation
## 1 market_cap dollar_vol 0.7210814
## 2 scaled_market_cap dollar_vol 0.7210814
## 3 log_spread bid_ask_spread 0.7265476
## 4 vol_cap_ratio turnover_ratio 0.7700479
## 5 log_volatility volatility 0.8074679
## 6 log_volatility sharpe_ratio 0.7121814
## Var1 Var2 value
## 1 returns returns 1.000000000
## 2 VOL returns -0.003687189
## 3 dollar_vol returns -0.005025484
## 4 bid_ask_spread returns 0.025407662
## 5 turnover_ratio returns -0.019845336
## 6 volatility returns 0.418324811
Build Model 6
# Splitting the dataset into training and testing sets
set.seed(847)
# COVID Period
X_p2_m6 <- task2_reg_p2_df3 %>%
select(Mkt_beta, SMB_beta, HML_beta, log_volatility, log_vol_cap_ratio, scaled_abs_corr_sp, company_sizeSmall, company_sizeLarge, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown)
y_p2_m6 <- task2_reg_p2_df3$returns
splitIndex_p2_m6 <- createDataPartition(y_p2_m6,
p = 0.8,
list = FALSE)
X_train_data_p2_m6 <- X_p2_m6[splitIndex_p2_m6, ]
X_test_data_p2_m6 <- X_p2_m6[-splitIndex_p2_m6, ]
y_train_data_p2_m6 <- y_p2_m6[splitIndex_p2_m6]
y_test_data_p2_m6 <- y_p2_m6[-splitIndex_p2_m6]
train_data_p2_m6 <- cbind(X_train_data_p2_m6, returns = y_train_data_p2_m6)
# Build Model
model_6_p2 <- lm(returns ~ ., data = train_data_p2_m6)
# Prediction for training sample
pred_train_p2_m6 <- predict(model_6_p2, newdata = X_train_data_p2_m6)
metrics_train_p2_m6 <- postResample(pred = pred_train_p2_m6,
obs = train_data_p2_m6$returns)
metrics_table_train_p2_m6 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_train_p2_m6["RMSE"]^2,
metrics_train_p2_m6["RMSE"],
metrics_train_p2_m6["MAE"],
metrics_train_p2_m6["Rsquared"])
)
# Prediction for testing sample
pred_test_p2_m6 <- predict(model_6_p2, newdata = X_test_data_p2_m6)
metrics_test_p2_m6 <- postResample(pred = pred_test_p2_m6,
obs = y_test_data_p2_m6)
metrics_table_test_p2_m6 <- data.frame(
Metric = c('MSE', 'RMSE', 'MAE', 'R-squared'),
Value = c(metrics_test_p2_m6["RMSE"]^2
, metrics_test_p2_m6["RMSE"], metrics_test_p2_m6["MAE"], metrics_test_p2_m6["Rsquared"])
)## Performance Metrics for the Training Dataset:
## Metric Value
## 1 MSE 0.2097130
## 2 RMSE 0.4579443
## 3 MAE 0.3544116
## 4 R-squared 0.1968778
## Performance Metrics for the Testing Dataset:
## Metric Value
## 1 MSE 0.2096475
## 2 RMSE 0.4578728
## 3 MAE 0.3562869
## 4 R-squared 0.1387898
## Regression coefficient of Model 1:
##
## Call:
## lm(formula = returns ~ ., data = train_data_p2_m6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7174 -0.3063 -0.0697 0.2466 1.6724
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.954031 0.073043 13.061 < 2e-16 ***
## Mkt_beta 4.270158 1.473787 2.897 0.003794 **
## SMB_beta 2.135912 0.907829 2.353 0.018707 *
## HML_beta 2.696746 1.134483 2.377 0.017521 *
## log_volatility 0.180457 0.020176 8.944 < 2e-16 ***
## log_vol_cap_ratio 0.037193 0.006628 5.611 2.22e-08 ***
## scaled_abs_corr_sp 0.189675 0.014024 13.525 < 2e-16 ***
## company_sizeSmall 0.087009 0.026089 3.335 0.000865 ***
## company_sizeLarge -0.112466 0.033023 -3.406 0.000670 ***
## SECTORMaterials 0.143236 0.066822 2.144 0.032159 *
## SECTORIndustrials 0.116480 0.052533 2.217 0.026689 *
## SECTORFinancials 0.140620 0.051212 2.746 0.006076 **
## SECTORInformation_Technology 0.083333 0.052969 1.573 0.115786
## SECTORConsumer_Discretionary 0.357843 0.054229 6.599 4.99e-11 ***
## SECTORHealth_Care 0.018774 0.050840 0.369 0.711956
## SECTOREnergy 0.354840 0.075084 4.726 2.41e-06 ***
## SECTORUtilities -0.144939 0.082070 -1.766 0.077506 .
## SECTORReal_Estate 0.284643 0.104170 2.733 0.006327 **
## SECTORCommunication_Services 0.237609 0.068935 3.447 0.000576 ***
## SECTORUnknown 0.194282 0.048248 4.027 5.81e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4597 on 2651 degrees of freedom
## Multiple R-squared: 0.1969, Adjusted R-squared: 0.1911
## F-statistic: 34.2 on 19 and 2651 DF, p-value: < 2.2e-16
## To make sure the variable are not multicollinear (VIF < 5)
## Mkt_beta SMB_beta
## 1.296221 1.233592
## HML_beta log_volatility
## 1.245251 2.438936
## log_vol_cap_ratio scaled_abs_corr_sp
## 2.186028 2.473524
## company_sizeSmall company_sizeLarge
## 1.901892 1.500542
## SECTORMaterials SECTORIndustrials
## 1.758857 3.284024
## SECTORFinancials SECTORInformation_Technology
## 4.045598 3.286238
## SECTORConsumer_Discretionary SECTORHealth_Care
## 2.890833 4.355018
## SECTOREnergy SECTORUtilities
## 1.641274 1.441101
## SECTORReal_Estate SECTORCommunication_Services
## 1.221472 1.660585
## SECTORUnknown
## 5.584303
## Despite SectorUnknown has a VIF over 5, considering it is actually the group of data with missing value in Sector, while some other industry categories are important to the model. After trial and error, it is better to keep the industry variable in the model.
# Scatterplot of Prediction vs Actual
plot_data_p2_m6 <- data.frame(
Predicted = pred_test_p2_m6,
Actual = y_test_data_p2_m6
)
plot_m6_p2 <- ggplot(data = plot_data_p2_m6,
aes(x = Predicted,
y = Actual)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom") +
labs(title = "Evaluating Model 6's Performance: Predicted vs. Actual Returns in the COVID Period",
x = "Predicted returns",
y = "Actual returns")
print(plot_m6_p2)Important variable
# Extract coefficients
coef_m6 <- coef(model_6_p2)
importance_m6 <- data.frame(
Factors = names(coef_m6),
Coefficient = coef_m6
)
importance_m6 <- importance_m6[importance_m6$Factor != "(Intercept)", ]
# Plot with positive and negative impacts
plot_m6_p2_factors <- ggplot(importance_m6, aes(x = reorder(Factors, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Coefficient, 3)),
vjust = ifelse(importance_m6$Coefficient > 0, 0.5, 0.3),
color = "black") +
coord_flip() +
labs(title = "Impact of factors on Returns for Model 6",
x = "Factors",
y = "Coefficient") +
theme_minimal() +
scale_fill_manual(
values = c("TRUE" = "steelblue", "FALSE" = "coral"),
labels = c("TRUE" = "Positive Impact",
"FALSE" = "Negative Impact"),
name = ""
) +
theme(
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
)
print(plot_m6_p2_factors)Group by Sector
# Apply the model particularly by industry
pred_sector_p2_m6 <- cbind(X_test_data_p2_m6, SECTOR = task2_reg_p2_df3$SECTOR[-splitIndex_p2_m6])
data_sector_p2_m6 <- data.frame(
Predicted = pred_test_p2_m6,
Actual = y_test_data_p2_m6,
SECTOR = pred_sector_p2_m6$SECTOR
)
data_sector_p2_m6$SECTOR <- as.character(data_sector_p2_m6$SECTOR)
# Calculate evaluation metrics to see the performance of model apply on SECTOR
metrics_table_sector_p2_m6 <- data_sector_p2_m6 %>%
group_by(SECTOR) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_sector_p2_m6)## # A tibble: 12 × 5
## SECTOR MSE RMSE MAE R_Squared
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Communication_Services 0.312 0.559 0.479 -0.0410
## 2 Consumer_Discretionary 0.224 0.473 0.391 0.0236
## 3 Consumer_Staples 0.204 0.452 0.305 0.0500
## 4 Energy 0.426 0.653 0.584 -0.0114
## 5 Financials 0.128 0.357 0.287 0.0582
## 6 Health_Care 0.237 0.487 0.387 0.153
## 7 Industrials 0.217 0.466 0.365 0.0202
## 8 Information_Technology 0.200 0.447 0.337 -0.0113
## 9 Materials 0.133 0.365 0.304 -0.608
## 10 Real_Estate 0.0577 0.240 0.219 0.427
## 11 Unknown 0.230 0.480 0.364 0.188
## 12 Utilities 0.128 0.358 0.288 -0.119
# Plot the prediction vs actual by SECTOR
plot_m6_p2_sector <- ggplot(data = data_sector_p2_m6,
aes(x = Predicted,
y = Actual,
color = SECTOR)) +
geom_point(size = 2) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 6's Performance: Predicted vs. Actual Returns",
subtitle = "During the COVID Period (Grouped by Industry Sector)",
x = "Predicted returns",
y = "Actual returns",
color = "Industry Sector")
print(plot_m6_p2_sector)Group by Company size
# Apply the model particularly by company size
pred_size_p2_m6 <- cbind(X_test_data_p2_m6, company_size = task2_reg_p2_df3$company_size[-splitIndex_p2_m6])
data_size_p2_m6 <- data.frame(
Predicted = pred_test_p2_m6,
Actual = y_test_data_p2_m6,
company_size = pred_size_p2_m6$company_size
)
# Calculate evaluation metrics to see the performance of model apply on company size
metrics_table_size_p2_m6 <- data_size_p2_m6 %>%
group_by(company_size) %>%
summarise(
MSE = mean((Actual - Predicted)^2),
RMSE = sqrt(MSE),
MAE = mean(abs(Actual - Predicted)),
residuals = sum((Actual - Predicted)^2),
total = sum((Actual - mean(Actual))^2),
R_Squared = 1 - (residuals / total)
) %>%
select(-c(residuals, total))
print(metrics_table_size_p2_m6)## # A tibble: 3 × 5
## company_size MSE RMSE MAE R_Squared
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Medium 0.205 0.452 0.362 0.116
## 2 Small 0.223 0.472 0.367 0.118
## 3 Large 0.141 0.376 0.286 0.0999
# Plot the prediction vs actual by company size
plot_m6_p2_size <- ggplot(data = data_size_p2_m6,
aes(x = Predicted,
y = Actual,
color = company_size)) +
geom_point(size = 2, alpha = 0.7) +
geom_smooth(method = "lm",
se = FALSE,
color = "blue",
aes(linetype = "Regression Line")) +
scale_linetype_manual(name = NULL,
values = "solid",
labels = "Regression Line") +
theme_minimal() +
theme(legend.position = "bottom",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold")) +
labs(title = "Evaluating Model 6's Performance: Predicted vs. Actual Returns",
subtitle = "During the COVID Period (Grouped by Company Size)",
x = "Predicted returns",
y = "Actual returns",
color = "Company Size")
print(plot_m6_p2_size)Different variable in each model
get_variables <- function(model) {
# Extract variables from the model
terms <- attr(model$terms, "term.labels")
return(terms)
}
ols_m1_v <- get_variables(model_1_p1)
ols_m2_v <- get_variables(model_2_p2)
ols_m3_v <- get_variables(model_3_p1)
ols_m4_v <- get_variables(model_4_p2)
ols_m5_v <- get_variables(model_5_p1)
ols_m6_v <- get_variables(model_6_p2)
ols_v_list <- list(
Model_1 = as.character(ols_m1_v),
Model_2 = as.character(ols_m2_v),
Model_3 = as.character(ols_m3_v),
Model_4 = as.character(ols_m4_v),
Model_5 = as.character(ols_m5_v),
Model_6 = as.character(ols_m6_v)
)
# Initialize empty data frame
ols_model_data <- data.frame(
model = character(),
variable = character(),
order = integer(),
stringsAsFactors = FALSE
)
# Populate the dataframe with the original order of variables
for (model in names(ols_v_list)) {
variables <- ols_v_list[[model]]
temp_df <- data.frame(
model = model,
variable = variables,
order = seq_along(variables),
stringsAsFactors = FALSE
)
ols_model_data <- rbind(ols_model_data, temp_df)
}
# Convert variable to a factor with levels ordered by 'order'
ols_model_data$variable <- factor(ols_model_data$variable, levels = sort(unique(ols_model_data$variable)))
# Generate plot
ols_models_v <- ggplot(ols_model_data, aes(x = model, y = variable, fill = model)) +
geom_tile(color = "white") +
labs(
title = "Variables Included in Each OLS Regression Model",
x = "Model",
y = "Variables"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
plot.title = element_text(hjust = 0.5, size = 16),
plot.title.position = "plot")
print(ols_models_v)3 Logistic Regressions
Preparation
Create price change dummy variable
# Use the last and the first date PRC between COVID to create dummy variable represent price increase (1) or decrease (0)
task3_price_dummy <- dataset1_df9 %>%
arrange(PERMNO, date) %>%
group_by(PERMNO) %>%
summarise(
up_PRC = ifelse(last(na.omit(PRC[date <= "2020-03-20"])) > first(na.omit(PRC[date >= "2020-02-14"])), 1, 0))Inspect up_PRC accuracy
up_PRC_summary <- dataset1_df9 %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
arrange(PERMNO, date) %>%
group_by(PERMNO) %>%
summarize(
first_PRC = first(na.omit(PRC)),
first_date = date[which(na.omit(PRC) == first_PRC)[1]],
last_PRC = last(na.omit(PRC)),
last_date = date[which(na.omit(PRC) == last_PRC)[1]],
.groups = 'drop'
) %>%
mutate(
up_PRC = ifelse(last_PRC > first_PRC, 1, 0)
)
up_PRC_check <- task3_price_dummy %>%
distinct(PERMNO, up_PRC) %>%
left_join(up_PRC_summary %>% select(PERMNO, up_PRC), by = "PERMNO", suffix = c("_dataset1", "_dataset2")) %>%
filter(up_PRC_dataset1 != up_PRC_dataset2) %>%
select(PERMNO, up_PRC_dataset1, up_PRC_dataset2)
if (nrow(up_PRC_check) > 0) {
print("These PERMNO has wrong up_PRC")
print(up_PRC_check$PERMNO)
} else {
print("All values of up_PRC are checked and accurate.")
}## [1] "All values of up_PRC are checked and accurate."
Create dataset
## # A tibble: 5 × 58
## PERMNO returns SHRCD TICKER SECTOR company_size VOL dollar_vol
## <chr> <dbl> <int> <chr> <fct> <fct> <dbl> <dbl>
## 1 10026 0.504 11 JJSF Consumer_Staples Medium 1.19e7 2.20e9
## 2 10028 0.0106 11 DGSE Unknown Small 9.71e6 1.62e7
## 3 10032 0.965 11 PLXS Information_Techno… Small 2.09e7 1.50e9
## 4 10044 0.551 11 RMCF Consumer_Staples Small 1.41e6 1.25e7
## 5 10051 1.06 11 HNGR Unknown Small 4.25e7 9.85e8
## # ℹ 50 more variables: bid_ask_spread <dbl>, turnover_ratio <dbl>,
## # volatility <dbl>, sharpe_ratio <dbl>, market_share <dbl>, market_cap <dbl>,
## # abs_corr_sp <dbl>, security_type <fct>, tracking_error <dbl>,
## # log_vol <dbl>, log_dollar_vol <dbl>, log_spread <dbl>,
## # log_turnover_ratio <dbl>, log_volatility <dbl>, log_market_share <dbl>,
## # log_market_cap <dbl>, log_tracking_error <dbl>, vol_cap_ratio <dbl>,
## # log_vol_cap_ratio <dbl>, scaled_vol <dbl[,1]>, …
Model 1: Price Change dummy variable: 2020-02-14 to 2020-03-20 & X variable: before 2020-02-14
Build Model 1 (Without industry)
# Select independent variable
X_logit_m1 <- task3 %>%
select(Mkt_beta, SMB_beta, HML_beta, log_spread, log_volatility, scaled_market_cap, log_tracking_error, log_turnover_ratio)
y_logit_m1 <- task3$up_PRC
# Split the dataset into training and testing sets
set.seed(675)
trainIndex <- createDataPartition(y_logit_m1, p = 0.8, list = FALSE)
X_train_logit_m1 <- X_logit_m1[trainIndex, ]
X_test_logit_m1 <- X_logit_m1[-trainIndex, ]
y_train_logit_m1 <- y_logit_m1[trainIndex]
y_test_logit_m1 <- y_logit_m1[-trainIndex]
# Run the logistic regression model
logit_model_1 <- glm(y_train_logit_m1 ~ .,
data = as.data.frame(X_train_logit_m1, y_train_logit_m1),
family = binomial())
# Display model summary
summary(logit_model_1)##
## Call:
## glm(formula = y_train_logit_m1 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m1,
## y_train_logit_m1))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.82283 0.70763 3.989 6.63e-05 ***
## Mkt_beta -101.76338 11.80252 -8.622 < 2e-16 ***
## SMB_beta -5.93703 7.78596 -0.763 0.4457
## HML_beta -3.06160 10.11749 -0.303 0.7622
## log_spread -0.41756 0.09364 -4.459 8.22e-06 ***
## log_volatility -0.50257 0.09543 -5.266 1.39e-07 ***
## scaled_market_cap -0.25935 0.35696 -0.727 0.4675
## log_tracking_error 2.34798 0.21690 10.825 < 2e-16 ***
## log_turnover_ratio 0.18226 0.08268 2.204 0.0275 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1432.0 on 4151 degrees of freedom
## Residual deviance: 1011.6 on 4143 degrees of freedom
## AIC: 1029.6
##
## Number of Fisher Scoring iterations: 8
## Mkt_beta SMB_beta HML_beta log_spread
## 1.286861 1.163409 1.053116 2.744029
## log_volatility scaled_market_cap log_tracking_error log_turnover_ratio
## 2.709422 1.239080 3.067389 1.328285
# Calculate predicted probabilities on testing data
pred_prob_m1 <- predict(logit_model_1,
newdata = as.data.frame(X_test_logit_m1),
type = "response")
# Determine classification based on threshold of 0.5
class_m1 <- ifelse(pred_prob_m1 > 0.5, 1, 0)
# Calculate True Positives, True Negatives, False Positives, False Negatives
true_positives_m1 <- sum(y_test_logit_m1 == 1 & class_m1 == 1)
true_negatives_m1 <- sum(y_test_logit_m1 == 0 & class_m1 == 0)
false_positives_m1 <- sum(y_test_logit_m1 == 0 & class_m1 == 1)
false_negatives_m1 <- sum(y_test_logit_m1 == 1 & class_m1 == 0)
# Calculate Accuracy
accuracy_m1 <- (true_positives_m1 + true_negatives_m1) / (true_positives_m1 + true_negatives_m1 + false_positives_m1 + false_negatives_m1)
# Calculate Precision
precision_m1 <- ifelse((true_positives_m1 + false_positives_m1) == 0, 0, true_positives_m1 / (true_positives_m1 + false_positives_m1))
# Calculate Recall
recall_m1 <- ifelse((true_positives_m1 + false_negatives_m1) == 0, 0, true_positives_m1 / (true_positives_m1 + false_negatives_m1))
# Calculate F1 Score
f1_m1 <- 2 * (precision_m1 * recall_m1) / (precision_m1 + recall_m1)
# Compute ROC curve and AUC
roc_curve_m1 <- roc(y_test_logit_m1, pred_prob_m1)
auc_value_m1 <- auc(roc_curve_m1)
# Calculate McFadden’s R-squared
null_model_1 <- glm(y_train_logit_m1 ~ 1, data = as.data.frame(cbind(X_train_logit_m1, y_train_logit_m1)), family = binomial())
rsquared_logit_m1 <- 1 - (logLik(logit_model_1) / logLik(null_model_1))
# Create a dataframe for counts
metrics_counts_logit_m1 <- data.frame(
Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
Count = c(
as.integer(true_positives_m1),
as.integer(true_negatives_m1),
as.integer(false_positives_m1),
as.integer(false_negatives_m1)
)
)
# Create a dataframe for performance metrics
metrics_evaluation_logit_m1 <- data.frame(
Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
Value = c(
accuracy_m1,
precision_m1,
recall_m1,
f1_m1,
auc_value_m1,
rsquared_logit_m1
)
)
# Print the results
print(metrics_counts_logit_m1)## Metric Count
## 1 True Positives 9
## 2 True Negatives 1011
## 3 False Positives 2
## 4 False Negatives 16
## Metric Value
## 1 Accuracy 0.9826590
## 2 Precision 0.8181818
## 3 Recall 0.3600000
## 4 F1 Score 0.5000000
## 5 AUC 0.9167621
## 6 R_squared 0.2935775
Confusion Matrix
true_false_m1 <- matrix(c(true_negatives_m1, false_positives_m1, false_negatives_m1, true_positives_m1),
nrow = 2,
dimnames = list(Predicted = c("Negative", "Positive"),
Actual = c("Negative", "Positive")))
conf_matrix_m1 <- as.data.frame(as.table(true_false_m1))
# Plot the confusion matrix
palette <- brewer.pal(n = 4,
name = "Greens") # Define color
conf_matrix_m1_plot <- ggplot(conf_matrix_m1,
aes(x = Predicted,
y = Actual,
fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq),
color = "black",
size = 6) +
scale_fill_gradientn(colors = brewer.pal(n = 4,
name = "Greens"),
guide = "none") +
labs(title = "Confusion Matrix for Model 1: Price Increase Predictions between February 14, 2020 to March 20, 2020",
x = "Predicted",
y = "Actual") +
theme_minimal() +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
print(conf_matrix_m1_plot) Important Factor
# Extract coefficients
coefficients_logit_m1 <- coef(logit_model_1)
# Convert coefficients to odds ratios
odds_ratios_m1 <- exp(coefficients_logit_m1)
# Identify intercept and filter it out
importance_logit_m1 <- data.frame(
Factors = names(coefficients_logit_m1),
Odds_Ratio = odds_ratios_m1
)
# Exclude the intercept (assuming it's named 'Intercept')
importance_logit_m1 <- importance_logit_m1[importance_logit_m1$Factor != "(Intercept)", ]
# Add a column for fill color based on the odds ratio
importance_logit_m1$Fill_Color <- ifelse(importance_logit_m1$Odds_Ratio > 1, "steelblue", "coral")
# Plot the factor
ggplot(importance_logit_m1,
aes(x = reorder(Factors, Odds_Ratio),
y = Odds_Ratio,
fill = Fill_Color)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_text(aes(label = round(Odds_Ratio, 2)),
hjust = 1,
size = 3) +
labs(title = "Impact of Factors on Price increase likelihood for Model 1: Positive Vs Negative Associations",
x = "Factors",
y = "Odds Ratio\n(The exponentiation of coefficients)") +
geom_hline(yintercept = 1,
linetype = "dashed",
color = "red") +
annotate("text",
x = Inf,
y = 1,
label = "Odds Ratio = 1",
hjust = 0,
vjust = 0,
color = "grey",
size = 3.5,
fontface = "italic") +
theme_minimal() +
scale_fill_manual(
values = c("steelblue" = "steelblue", "coral" = "coral"),
labels = c("steelblue" = "Positively Associated with Price Increase",
"coral" = "Negatively Associated with Price Increase"),
name = "") +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
) Model 2: Continuous Variables from Model 1 with Industry Variable
# Select independent variable
X_logit_m2 <- task3 %>%
select(Mkt_beta, SMB_beta, HML_beta, log_spread, log_volatility, scaled_market_cap, log_tracking_error, log_turnover_ratio, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown
)
y_logit_m2 <- task3$up_PRC
# Split the dataset into training and testing sets
set.seed(675)
trainIndex <- createDataPartition(y_logit_m2, p = 0.8, list = FALSE)
X_train_logit_m2 <- X_logit_m2[trainIndex, ]
X_test_logit_m2 <- X_logit_m2[-trainIndex, ]
y_train_logit_m2 <- y_logit_m2[trainIndex]
y_test_logit_m2 <- y_logit_m2[-trainIndex]
# Fit the logistic regression model
logit_model_2 <- glm(y_train_logit_m2 ~ .,
data = as.data.frame(X_train_logit_m2, y_train_logit_m2),
family = binomial())
# Display model summary
summary(logit_model_2)##
## Call:
## glm(formula = y_train_logit_m2 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m2,
## y_train_logit_m2))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.91523 0.89242 4.387 1.15e-05 ***
## Mkt_beta -96.88856 12.07981 -8.021 1.05e-15 ***
## SMB_beta -7.54789 8.04238 -0.939 0.347980
## HML_beta 1.58545 10.76882 0.147 0.882954
## log_spread -0.46597 0.09768 -4.770 1.84e-06 ***
## log_volatility -0.43685 0.10269 -4.254 2.10e-05 ***
## scaled_market_cap -0.30859 0.35838 -0.861 0.389204
## log_tracking_error 2.42835 0.22352 10.864 < 2e-16 ***
## log_turnover_ratio 0.09634 0.08735 1.103 0.270075
## SECTORMaterials -1.95078 0.71579 -2.725 0.006423 **
## SECTORIndustrials -3.11610 0.80881 -3.853 0.000117 ***
## SECTORFinancials -2.55834 0.66841 -3.827 0.000129 ***
## SECTORInformation_Technology -2.33281 0.63495 -3.674 0.000239 ***
## SECTORConsumer_Discretionary -3.57383 1.05670 -3.382 0.000719 ***
## SECTORHealth_Care -1.13104 0.40944 -2.762 0.005738 **
## SECTOREnergy -1.89771 0.90855 -2.089 0.036733 *
## SECTORUtilities -2.09672 1.06645 -1.966 0.049291 *
## SECTORReal_Estate -15.67593 847.50173 -0.018 0.985243
## SECTORCommunication_Services -15.78635 430.03630 -0.037 0.970717
## SECTORUnknown -1.23672 0.36444 -3.393 0.000690 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1432.02 on 4151 degrees of freedom
## Residual deviance: 957.46 on 4132 degrees of freedom
## AIC: 997.46
##
## Number of Fisher Scoring iterations: 16
## Mkt_beta SMB_beta
## 1.314022 1.235449
## HML_beta log_spread
## 1.131153 2.813206
## log_volatility scaled_market_cap
## 3.221248 1.276669
## log_tracking_error log_turnover_ratio
## 3.152487 1.423055
## SECTORMaterials SECTORIndustrials
## 1.263827 1.190700
## SECTORFinancials SECTORInformation_Technology
## 1.287306 1.354766
## SECTORConsumer_Discretionary SECTORHealth_Care
## 1.095006 2.923145
## SECTOREnergy SECTORUtilities
## 1.250603 1.097808
## SECTORReal_Estate SECTORCommunication_Services
## 1.000000 1.000001
## SECTORUnknown
## 3.807168
# Calculate predicted probabilities on testing data
pred_prob_m2 <- predict(logit_model_2,
newdata = as.data.frame(X_test_logit_m2),
type = "response")
# Determine classification based on threshold of 0.5
class_m2 <- ifelse(pred_prob_m2 > 0.5, 1, 0)
# Calculate True Positives, True Negatives, False Positives, False Negatives
true_positives_m2 <- sum(y_test_logit_m2 == 1 & class_m2 == 1)
true_negatives_m2 <- sum(y_test_logit_m2 == 0 & class_m2 == 0)
false_positives_m2 <- sum(y_test_logit_m2 == 0 & class_m2 == 1)
false_negatives_m2 <- sum(y_test_logit_m2 == 1 & class_m2 == 0)
# Calculate Accuracy
accuracy_m2 <- (true_positives_m2 + true_negatives_m2) / (true_positives_m2 + true_negatives_m2 + false_positives_m2 + false_negatives_m2)
# Calculate Precision
precision_m2 <- ifelse((true_positives_m2 + false_positives_m2) == 0, 0, true_positives_m2 / (true_positives_m2 + false_positives_m2))
# Calculate Recall
recall_m2 <- ifelse((true_positives_m2 + false_negatives_m2) == 0, 0, true_positives_m2 / (true_positives_m2 + false_negatives_m2))
# Calculate F1 Score
f1_m2 <- 2 * (precision_m2 * recall_m2) / (precision_m2 + recall_m2)
# Compute ROC curve and AUC
roc_curve_m2 <- roc(y_test_logit_m2, pred_prob_m2)
auc_value_m2 <- auc(roc_curve_m2)
# Calculate McFadden’s R-squared
null_model_2 <- glm(y_train_logit_m2 ~ 1, data = as.data.frame(cbind(X_train_logit_m2, y_train_logit_m2)), family = binomial())
rsquared_logit_m2 <- 1 - (logLik(logit_model_2) / logLik(null_model_2))
# Create a dataframe for counts
metrics_counts_logit_m2 <- data.frame(
Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
Count = c(
as.integer(true_positives_m2),
as.integer(true_negatives_m2),
as.integer(false_positives_m2),
as.integer(false_negatives_m2)
)
)
# Create a dataframe for performance metrics
metrics_evaluation_logit_m2 <- data.frame(
Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
Value = c(
accuracy_m2,
precision_m2,
recall_m2,
f1_m2,
auc_value_m2,
rsquared_logit_m2
)
)
# Print the results
print(metrics_counts_logit_m2)## Metric Count
## 1 True Positives 10
## 2 True Negatives 1011
## 3 False Positives 2
## 4 False Negatives 15
## Metric Value
## 1 Accuracy 0.9836224
## 2 Precision 0.8333333
## 3 Recall 0.4000000
## 4 F1 Score 0.5405405
## 5 AUC 0.8991115
## 6 R_squared 0.3313900
Confusion Matrix
true_false_m2 <- matrix(c(true_negatives_m2, false_positives_m2, false_negatives_m2, true_positives_m2),
nrow = 2,
dimnames = list(Predicted = c("Negative", "Positive"),
Actual = c("Negative", "Positive")))
conf_matrix_m2 <- as.data.frame(as.table(true_false_m2))
# Plot the confusion matrix
palette <- brewer.pal(n = 4,
name = "Greens") # Define color
conf_matrix_m2_plot <- ggplot(conf_matrix_m2,
aes(x = Predicted,
y = Actual,
fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq),
color = "black",
size = 6) +
scale_fill_gradientn(colors = brewer.pal(n = 4,
name = "Greens"),
guide = "none") +
labs(title = "Confusion Matrix for Model 2: Price Increase Predictions between February 14, 2020 to March 20, 2020",
subtitle = "(Model 2: Continuous Variables from Model 1 with Industry Variable)",
x = "Predicted",
y = "Actual") +
theme_minimal() +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
print(conf_matrix_m2_plot) Important Factor
# Extract coefficients
coefficients_logit_m2 <- coef(logit_model_2)
# Convert coefficients to odds ratios
odds_ratios_m2 <- exp(coefficients_logit_m2)
# Identify intercept and filter it out
importance_logit_m2 <- data.frame(
Factors = names(coefficients_logit_m2),
Odds_Ratio = odds_ratios_m2
)
# Exclude the intercept (assuming it's named 'Intercept')
importance_logit_m2 <- importance_logit_m2[importance_logit_m2$Factor != "(Intercept)", ]
# Add a column for fill color based on the odds ratio
importance_logit_m2$Fill_Color <- ifelse(importance_logit_m2$Odds_Ratio > 1, "steelblue", "coral")
# Plot the factor
ggplot(importance_logit_m2,
aes(x = reorder(Factors, Odds_Ratio),
y = Odds_Ratio,
fill = Fill_Color)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_text(aes(label = round(Odds_Ratio, 2)),
hjust = 1,
size = 3) +
labs(title = "Impact of Factors on Price increase likelihood for Model 2: Positive Vs Negative Associations",
subtitle = "(Model 2: Continuous Variables from Model 1 with Industry Variable)",
x = "Factors",
y = "Odds Ratio\n(The exponentiation of coefficients)") +
geom_hline(yintercept = 1,
linetype = "dashed",
color = "red") +
annotate("text",
x = Inf,
y = 1,
label = "Odds Ratio = 1",
hjust = 0,
vjust = 0,
color = "grey",
size = 3.5,
fontface = "italic") +
theme_minimal() +
scale_fill_manual(
values = c("steelblue" = "steelblue", "coral" = "coral"),
labels = c("steelblue" = "Positively Associated with Price Increase",
"coral" = "Negatively Associated with Price Increase"),
name = "") +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
) Model 3: Different Continuous Variables with Industry Variable Compared to Model 2
# Select independent variable
X_logit_m3 <- task3 %>%
select(Mkt_beta, SMB_beta, scaled_dollar_vol, log_spread, log_volatility, log_tracking_error, log_turnover_ratio, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown
)
y_logit_m3 <- task3$up_PRC
# Split the dataset into training and testing sets
set.seed(675)
trainIndex <- createDataPartition(y_logit_m3, p = 0.8, list = FALSE)
X_train_logit_m3 <- X_logit_m3[trainIndex, ]
X_test_logit_m3 <- X_logit_m3[-trainIndex, ]
y_train_logit_m3 <- y_logit_m3[trainIndex]
y_test_logit_m3 <- y_logit_m3[-trainIndex]
# Fit the logistic regression model
logit_model_3 <- glm(y_train_logit_m3 ~ .,
data = as.data.frame(X_train_logit_m3, y_train_logit_m3),
family = binomial())
# Display model summary
summary(logit_model_3)##
## Call:
## glm(formula = y_train_logit_m3 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m3,
## y_train_logit_m3))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.05066 0.86775 4.668 3.04e-06 ***
## Mkt_beta -96.67465 11.96738 -8.078 6.57e-16 ***
## SMB_beta -7.05142 7.96289 -0.886 0.375868
## scaled_dollar_vol -0.16817 0.24663 -0.682 0.495319
## log_spread -0.45490 0.09665 -4.707 2.52e-06 ***
## log_volatility -0.44120 0.10272 -4.295 1.74e-05 ***
## log_tracking_error 2.43435 0.22285 10.924 < 2e-16 ***
## log_turnover_ratio 0.11227 0.08640 1.299 0.193833
## SECTORMaterials -1.92123 0.71533 -2.686 0.007236 **
## SECTORIndustrials -3.08373 0.80834 -3.815 0.000136 ***
## SECTORFinancials -2.51658 0.66675 -3.774 0.000160 ***
## SECTORInformation_Technology -2.30834 0.63461 -3.637 0.000275 ***
## SECTORConsumer_Discretionary -3.53092 1.05639 -3.342 0.000830 ***
## SECTORHealth_Care -1.11326 0.40781 -2.730 0.006336 **
## SECTOREnergy -1.85228 0.88731 -2.088 0.036840 *
## SECTORUtilities -2.11536 1.06523 -1.986 0.047052 *
## SECTORReal_Estate -15.62660 847.77716 -0.018 0.985294
## SECTORCommunication_Services -15.75684 432.20897 -0.036 0.970918
## SECTORUnknown -1.19074 0.36180 -3.291 0.000998 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1432.02 on 4151 degrees of freedom
## Residual deviance: 958.12 on 4133 degrees of freedom
## AIC: 996.12
##
## Number of Fisher Scoring iterations: 16
## Mkt_beta SMB_beta
## 1.287699 1.224224
## scaled_dollar_vol log_spread
## 1.222968 2.780091
## log_volatility log_tracking_error
## 3.226058 3.154757
## log_turnover_ratio SECTORMaterials
## 1.387073 1.261708
## SECTORIndustrials SECTORFinancials
## 1.189484 1.280925
## SECTORInformation_Technology SECTORConsumer_Discretionary
## 1.348813 1.093860
## SECTORHealth_Care SECTOREnergy
## 2.898106 1.199260
## SECTORUtilities SECTORReal_Estate
## 1.096418 1.000000
## SECTORCommunication_Services SECTORUnknown
## 1.000001 3.754395
# Calculate predicted probabilities on testing data
pred_prob_m3 <- predict(logit_model_3,
newdata = as.data.frame(X_test_logit_m3),
type = "response")
# Determine classification based on threshold of 0.5
class_m3 <- ifelse(pred_prob_m3 > 0.5, 1, 0)
# Calculate True Positives, True Negatives, False Positives, False Negatives
true_positives_m3 <- sum(y_test_logit_m3 == 1 & class_m3 == 1)
true_negatives_m3 <- sum(y_test_logit_m3 == 0 & class_m3 == 0)
false_positives_m3 <- sum(y_test_logit_m3 == 0 & class_m3 == 1)
false_negatives_m3 <- sum(y_test_logit_m3 == 1 & class_m3 == 0)
# Calculate Accuracy
accuracy_m3 <- (true_positives_m3 + true_negatives_m3) / (true_positives_m3 + true_negatives_m3 + false_positives_m3 + false_negatives_m3)
# Calculate Precision
precision_m3 <- ifelse((true_positives_m3 + false_positives_m3) == 0, 0, true_positives_m3 / (true_positives_m3 + false_positives_m3))
# Calculate Recall
recall_m3 <- ifelse((true_positives_m3 + false_negatives_m3) == 0, 0, true_positives_m3 / (true_positives_m3 + false_negatives_m3))
# Calculate F1 Score
f1_m3 <- 2 * (precision_m3 * recall_m3) / (precision_m3 + recall_m3)
# Compute ROC curve and AUC
roc_curve_m3 <- roc(y_test_logit_m3, pred_prob_m3)
auc_value_m3 <- auc(roc_curve_m3)
# Calculate McFadden’s R-squared
null_model_3 <- glm(y_train_logit_m3 ~ 1, data = as.data.frame(cbind(X_train_logit_m3, y_train_logit_m3)), family = binomial())
rsquared_logit_m3 <- 1 - (logLik(logit_model_3) / logLik(null_model_3))
# Create a dataframe for counts
metrics_counts_logit_m3 <- data.frame(
Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
Count = c(
as.integer(true_positives_m3),
as.integer(true_negatives_m3),
as.integer(false_positives_m3),
as.integer(false_negatives_m3)
)
)
# Create a dataframe for performance metrics
metrics_evaluation_logit_m3 <- data.frame(
Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
Value = c(
accuracy_m3,
precision_m3,
recall_m3,
f1_m3,
auc_value_m3,
rsquared_logit_m3
)
)
# Print the results
print(metrics_counts_logit_m3)## Metric Count
## 1 True Positives 11
## 2 True Negatives 1011
## 3 False Positives 2
## 4 False Negatives 14
## Metric Value
## 1 Accuracy 0.9845857
## 2 Precision 0.8461538
## 3 Recall 0.4400000
## 4 F1 Score 0.5789474
## 5 AUC 0.8986772
## 6 R_squared 0.3309308
Confusion Matrix
true_false_m3 <- matrix(c(true_negatives_m3, false_positives_m3, false_negatives_m3, true_positives_m3),
nrow = 2,
dimnames = list(Predicted = c("Negative", "Positive"),
Actual = c("Negative", "Positive")))
conf_matrix_m3 <- as.data.frame(as.table(true_false_m3))
# Plot the confusion matrix
palette <- brewer.pal(n = 4,
name = "Greens") # Define color
conf_matrix_m3_plot <- ggplot(conf_matrix_m3,
aes(x = Predicted,
y = Actual,
fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq),
color = "black",
size = 6) +
scale_fill_gradientn(colors = brewer.pal(n = 4,
name = "Greens"),
guide = "none") +
labs(title = "Confusion Matrix for Model 3: Price Increase Predictions between February 14, 2020 to March 20, 2020",
subtitle = "(Model 3: Different Continuous Variables with Industry Variable Compared to Model 2)",
x = "Predicted",
y = "Actual") +
theme_minimal() +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
print(conf_matrix_m3_plot) Important Factor
# Extract coefficients
coefficients_logit_m3 <- coef(logit_model_3)
# Convert coefficients to odds ratios
odds_ratios_m3 <- exp(coefficients_logit_m3)
# Identify intercept and filter it out
importance_logit_m3 <- data.frame(
Factors = names(coefficients_logit_m3),
Odds_Ratio = odds_ratios_m3
)
# Exclude the intercept (assuming it's named 'Intercept')
importance_logit_m3 <- importance_logit_m3[importance_logit_m3$Factor != "(Intercept)", ]
# Add a column for fill color based on the odds ratio
importance_logit_m3$Fill_Color <- ifelse(importance_logit_m3$Odds_Ratio > 1, "steelblue", "coral")
# Plot with positive and negative impacts
# Plot the factor
ggplot(importance_logit_m3,
aes(x = reorder(Factors, Odds_Ratio),
y = Odds_Ratio,
fill = Fill_Color)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_text(aes(label = round(Odds_Ratio, 2)),
hjust = 1,
size = 3) +
labs(title = "Impact of Factors on Price increase likelihood for Model 3: Positive Vs Negative Associations",
subtitle = "((Model 3: Different Continuous Variables with Industry Variable Compared to Model 2))",
x = "Factors",
y = "Odds Ratio\n(The exponentiation of coefficients)") +
geom_hline(yintercept = 1,
linetype = "dashed",
color = "red") +
annotate("text",
x = Inf,
y = 1,
label = "Odds Ratio = 1",
hjust = 0,
vjust = 0,
color = "grey",
size = 3.5,
fontface = "italic") +
theme_minimal() +
scale_fill_manual(
values = c("steelblue" = "steelblue", "coral" = "coral"),
labels = c("steelblue" = "Positively Associated with Price Increase",
"coral" = "Negatively Associated with Price Increase"),
name = "") +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
) Model 4: Continuous Variables from Model 3 without Industry Variable
# Select independent variable
X_logit_m4 <- task3 %>%
select(Mkt_beta, SMB_beta, scaled_dollar_vol, log_spread, log_volatility, log_tracking_error, log_turnover_ratio)
y_logit_m4 <- task3$up_PRC
# Split the dataset into training and testing sets
set.seed(675)
trainIndex <- createDataPartition(y_logit_m4, p = 0.8, list = FALSE)
X_train_logit_m4 <- X_logit_m4[trainIndex, ]
X_test_logit_m4 <- X_logit_m4[-trainIndex, ]
y_train_logit_m4 <- y_logit_m4[trainIndex]
y_test_logit_m4 <- y_logit_m4[-trainIndex]
# Fit the logistic regression model
logit_model_4 <- glm(y_train_logit_m4 ~ .,
data = as.data.frame(X_train_logit_m4, y_train_logit_m4),
family = binomial())
# Display model summary
summary(logit_model_4)##
## Call:
## glm(formula = y_train_logit_m4 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m4,
## y_train_logit_m4))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.93863 0.66511 4.418 9.95e-06 ***
## Mkt_beta -102.74998 11.66734 -8.807 < 2e-16 ***
## SMB_beta -5.73941 7.76574 -0.739 0.4599
## scaled_dollar_vol -0.14856 0.22590 -0.658 0.5108
## log_spread -0.40899 0.09140 -4.475 7.65e-06 ***
## log_volatility -0.50583 0.09463 -5.345 9.03e-08 ***
## log_tracking_error 2.34476 0.21631 10.840 < 2e-16 ***
## log_turnover_ratio 0.19554 0.08185 2.389 0.0169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1432.0 on 4151 degrees of freedom
## Residual deviance: 1011.9 on 4144 degrees of freedom
## AIC: 1027.9
##
## Number of Fisher Scoring iterations: 7
## Mkt_beta SMB_beta scaled_dollar_vol log_spread
## 1.248160 1.155427 1.178263 2.626577
## log_volatility log_tracking_error log_turnover_ratio
## 2.661042 3.058806 1.297536
# Calculate predicted probabilities on testing data
pred_prob_m4 <- predict(logit_model_4,
newdata = as.data.frame(X_test_logit_m4),
type = "response")
# Determine classification based on threshold of 0.5
class_m4 <- ifelse(pred_prob_m4 > 0.5, 1, 0)
# Calculate True Positives, True Negatives, False Positives, False Negatives
true_positives_m4 <- sum(y_test_logit_m4 == 1 & class_m4 == 1)
true_negatives_m4 <- sum(y_test_logit_m4 == 0 & class_m4 == 0)
false_positives_m4 <- sum(y_test_logit_m4 == 0 & class_m4 == 1)
false_negatives_m4 <- sum(y_test_logit_m4 == 1 & class_m4 == 0)
# Calculate Accuracy
accuracy_m4 <- (true_positives_m4 + true_negatives_m4) / (true_positives_m4 + true_negatives_m4 + false_positives_m4 + false_negatives_m4)
# Calculate Precision
precision_m4 <- ifelse((true_positives_m4 + false_positives_m4) == 0, 0, true_positives_m4 / (true_positives_m4 + false_positives_m4))
# Calculate Recall
recall_m4 <- ifelse((true_positives_m4 + false_negatives_m4) == 0, 0, true_positives_m4 / (true_positives_m4 + false_negatives_m4))
# Calculate F1 Score
f1_m4 <- 2 * (precision_m4 * recall_m4) / (precision_m4 + recall_m4)
# Compute ROC curve and AUC
roc_curve_m4 <- roc(y_test_logit_m4, pred_prob_m4)
auc_value_m4 <- auc(roc_curve_m4)
# Calculate McFadden’s R-squared
null_model_4 <- glm(y_train_logit_m4 ~ 1, data = as.data.frame(cbind(X_train_logit_m4, y_train_logit_m4)), family = binomial())
rsquared_logit_m4 <- 1 - (logLik(logit_model_4) / logLik(null_model_4))
# Create a dataframe for counts
metrics_counts_logit_m4 <- data.frame(
Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
Count = c(
as.integer(true_positives_m4),
as.integer(true_negatives_m4),
as.integer(false_positives_m4),
as.integer(false_negatives_m4)
)
)
# Create a dataframe for performance metrics
metrics_evaluation_logit_m4 <- data.frame(
Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
Value = c(
accuracy_m4,
precision_m4,
recall_m4,
f1_m4,
auc_value_m4,
rsquared_logit_m4
)
)
# Print the results
print(metrics_counts_logit_m4)## Metric Count
## 1 True Positives 9
## 2 True Negatives 1011
## 3 False Positives 2
## 4 False Negatives 16
## Metric Value
## 1 Accuracy 0.9826590
## 2 Precision 0.8181818
## 3 Recall 0.3600000
## 4 F1 Score 0.5000000
## 5 AUC 0.9169990
## 6 R_squared 0.2933615
Confusion Matrix
true_false_m4 <- matrix(c(true_negatives_m4, false_positives_m4, false_negatives_m4, true_positives_m4),
nrow = 2,
dimnames = list(Predicted = c("Negative", "Positive"),
Actual = c("Negative", "Positive")))
conf_matrix_m4 <- as.data.frame(as.table(true_false_m4))
# Plot the confusion matrix
palette <- brewer.pal(n = 4,
name = "Greens") # Define color
conf_matrix_m4_plot <- ggplot(conf_matrix_m4,
aes(x = Predicted,
y = Actual,
fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq),
color = "black",
size = 6) +
scale_fill_gradientn(colors = brewer.pal(n = 4,
name = "Greens"),
guide = "none") +
labs(title = "Confusion Matrix for Model 4: Price Increase Predictions between February 14, 2020 to March 20, 2020",
subtitle = "(Model 4: Continuous Variables from Model 3 without Industry Variable)",
x = "Predicted",
y = "Actual") +
theme_minimal() +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
print(conf_matrix_m4_plot) Important Factor
# Extract coefficients
coefficients_logit_m4 <- coef(logit_model_4)
# Convert coefficients to odds ratios
odds_ratios_m4 <- exp(coefficients_logit_m4)
# Identify intercept and filter it out
importance_logit_m4 <- data.frame(
Factors = names(coefficients_logit_m4),
Odds_Ratio = odds_ratios_m4
)
# Exclude the intercept (assuming it's named 'Intercept')
importance_logit_m4 <- importance_logit_m4[importance_logit_m4$Factor != "(Intercept)", ]
# Add a column for fill color based on the odds ratio
importance_logit_m4$Fill_Color <- ifelse(importance_logit_m4$Odds_Ratio > 1, "steelblue", "coral")
# Plot with positive and negative impacts
# Plot the factor
ggplot(importance_logit_m4,
aes(x = reorder(Factors, Odds_Ratio),
y = Odds_Ratio,
fill = Fill_Color)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_text(aes(label = round(Odds_Ratio, 2)),
hjust = 1,
size = 3) +
labs(title = "Impact of Factors on Price increase likelihood for Model 4: Positive Vs Negative Associations",
subtitle = "(Model 4: Continuous Variables from Model 3 without Industry Variable)",
x = "Factors",
y = "Odds Ratio\n(The exponentiation of coefficients)") +
geom_hline(yintercept = 1,
linetype = "dashed",
color = "red") +
annotate("text",
x = Inf,
y = 1,
label = "Odds Ratio = 1",
hjust = 0,
vjust = 0,
color = "grey",
size = 3.5,
fontface = "italic") +
theme_minimal() +
scale_fill_manual(
values = c("steelblue" = "steelblue", "coral" = "coral"),
labels = c("steelblue" = "Positively Associated with Price Increase",
"coral" = "Negatively Associated with Price Increase"),
name = "") +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
) Model 5: Same variables with Model 1 and with Type of security variable
# Select independent variable
X_logit_m5 <- task3 %>%
select(Mkt_beta, SMB_beta, HML_beta, log_spread, log_volatility, scaled_market_cap, log_tracking_error, log_turnover_ratio, security_typeETF)
y_logit_m5 <- task3$up_PRC
# Split the dataset into training and testing sets
set.seed(675)
trainIndex <- createDataPartition(y_logit_m5, p = 0.8, list = FALSE)
X_train_logit_m5 <- X_logit_m5[trainIndex, ]
X_test_logit_m5 <- X_logit_m5[-trainIndex, ]
y_train_logit_m5 <- y_logit_m5[trainIndex]
y_test_logit_m5 <- y_logit_m5[-trainIndex]
# Fit the logistic regression model
logit_model_5 <- glm(y_train_logit_m5 ~ .,
data = as.data.frame(X_train_logit_m5, y_train_logit_m5),
family = binomial())
# Display model summary
summary(logit_model_5)##
## Call:
## glm(formula = y_train_logit_m5 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m5,
## y_train_logit_m5))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.29389 0.72950 3.144 0.00166 **
## Mkt_beta -82.15234 12.30411 -6.677 2.44e-11 ***
## SMB_beta -2.70889 7.68745 -0.352 0.72455
## HML_beta -1.55521 9.92702 -0.157 0.87551
## log_spread -0.48394 0.09657 -5.011 5.41e-07 ***
## log_volatility -0.22405 0.11883 -1.885 0.05938 .
## scaled_market_cap -0.09525 0.29013 -0.328 0.74269
## log_tracking_error 2.60729 0.22094 11.801 < 2e-16 ***
## log_turnover_ratio 0.01454 0.09322 0.156 0.87606
## security_typeETF 1.44861 0.34167 4.240 2.24e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1432.02 on 4151 degrees of freedom
## Residual deviance: 993.57 on 4142 degrees of freedom
## AIC: 1013.6
##
## Number of Fisher Scoring iterations: 8
# Calculate predicted probabilities on testing data
pred_prob_m5 <- predict(logit_model_5,
newdata = as.data.frame(X_test_logit_m5),
type = "response")
# Determine classification based on threshold of 0.5
class_m5 <- ifelse(pred_prob_m5 > 0.5, 1, 0)
# Calculate True Positives, True Negatives, False Positives, False Negatives
true_positives_m5 <- sum(y_test_logit_m5 == 1 & class_m5 == 1)
true_negatives_m5 <- sum(y_test_logit_m5 == 0 & class_m5 == 0)
false_positives_m5 <- sum(y_test_logit_m5 == 0 & class_m5 == 1)
false_negatives_m5 <- sum(y_test_logit_m5 == 1 & class_m5 == 0)
# Calculate Accuracy
accuracy_m5 <- (true_positives_m5 + true_negatives_m5) / (true_positives_m5 + true_negatives_m5 + false_positives_m5 + false_negatives_m5)
# Calculate Precision
precision_m5 <- ifelse((true_positives_m5 + false_positives_m5) == 0, 0, true_positives_m5 / (true_positives_m5 + false_positives_m5))
# Calculate Recall
recall_m5 <- ifelse((true_positives_m5 + false_negatives_m5) == 0, 0, true_positives_m5 / (true_positives_m5 + false_negatives_m5))
# Calculate F1 Score
f1_m5 <- 2 * (precision_m5 * recall_m5) / (precision_m5 + recall_m5)
# Compute ROC curve and AUC
roc_curve_m5 <- roc(y_test_logit_m5, pred_prob_m5)
auc_value_m5 <- auc(roc_curve_m5)
# Calculate McFadden’s R-squared
null_model_5 <- glm(y_train_logit_m5 ~ 1, data = as.data.frame(cbind(X_train_logit_m5, y_train_logit_m5)), family = binomial())
rsquared_logit_m5 <- 1 - (logLik(logit_model_5) / logLik(null_model_5))
# Create a dataframe for counts
metrics_counts_logit_m5 <- data.frame(
Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
Count = c(
as.integer(true_positives_m5),
as.integer(true_negatives_m5),
as.integer(false_positives_m5),
as.integer(false_negatives_m5)
)
)
# Create a dataframe for performance metrics
metrics_evaluation_logit_m5 <- data.frame(
Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
Value = c(
accuracy_m5,
precision_m5,
recall_m5,
f1_m5,
auc_value_m5,
rsquared_logit_m5
)
)
# Print the results
print(metrics_counts_logit_m5)## Metric Count
## 1 True Positives 9
## 2 True Negatives 1011
## 3 False Positives 2
## 4 False Negatives 16
## Metric Value
## 1 Accuracy 0.9826590
## 2 Precision 0.8181818
## 3 Recall 0.3600000
## 4 F1 Score 0.5000000
## 5 AUC 0.9130109
## 6 R_squared 0.3061749
Confusion Matrix
true_false_m5 <- matrix(c(true_negatives_m5, false_positives_m5, false_negatives_m5, true_positives_m5),
nrow = 2,
dimnames = list(Predicted = c("Negative", "Positive"),
Actual = c("Negative", "Positive")))
conf_matrix_m5 <- as.data.frame(as.table(true_false_m5))
# Plot the confusion matrix
palette <- brewer.pal(n = 4,
name = "Greens") # Define color
conf_matrix_m5_plot <- ggplot(conf_matrix_m5,
aes(x = Predicted,
y = Actual,
fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq),
color = "black",
size = 6) +
scale_fill_gradientn(colors = brewer.pal(n = 4,
name = "Greens"),
guide = "none") +
labs(title = "Confusion Matrix for Model 5: Price Increase Predictions between February 14, 2020 to March 20, 2020",
subtitle = "(Model 5: Same variables with Model 1 and with Type of security variable)",
x = "Predicted",
y = "Actual") +
theme_minimal() +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
print(conf_matrix_m5_plot) Important Factor
# Extract coefficients
coefficients_logit_m5 <- coef(logit_model_5)
# Convert coefficients to odds ratios
odds_ratios_m5 <- exp(coefficients_logit_m5)
# Identify intercept and filter it out
importance_logit_m5 <- data.frame(
Factors = names(coefficients_logit_m5),
Odds_Ratio = odds_ratios_m5
)
# Exclude the intercept (assuming it's named 'Intercept')
importance_logit_m5 <- importance_logit_m5[importance_logit_m5$Factor != "(Intercept)", ]
# Add a column for fill color based on the odds ratio
importance_logit_m5$Fill_Color <- ifelse(importance_logit_m5$Odds_Ratio > 1, "steelblue", "coral")
# Plot the factor
ggplot(importance_logit_m5,
aes(x = reorder(Factors, Odds_Ratio),
y = Odds_Ratio,
fill = Fill_Color)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_text(aes(label = round(Odds_Ratio, 2)),
hjust = 1,
size = 3) +
labs(title = "Impact of Factors on Price increase likelihood for Model 5: Positive Vs Negative Associations",
subtitle = "(Model 5: Continuous Variables from Model 1 with type of security variable)",
x = "Factors",
y = "Odds Ratio\n(The exponentiation of coefficients)") +
geom_hline(yintercept = 1,
linetype = "dashed",
color = "red") +
annotate("text",
x = Inf,
y = 1,
label = "Odds Ratio = 1",
hjust = 0,
vjust = 0,
color = "grey",
size = 3.5,
fontface = "italic") +
theme_minimal() +
scale_fill_manual(
values = c("steelblue" = "steelblue", "coral" = "coral"),
labels = c("steelblue" = "Positively Associated with Price Increase",
"coral" = "Negatively Associated with Price Increase"),
name = "") +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
) Model 6: Same variables with Model 3 and with Type of security variable
# Select independent variable
X_logit_m6 <- task3 %>%
select(Mkt_beta, SMB_beta, scaled_dollar_vol, log_spread, log_volatility, log_tracking_error, log_turnover_ratio, SECTORMaterials, SECTORIndustrials, SECTORFinancials, SECTORInformation_Technology, SECTORConsumer_Discretionary, SECTORHealth_Care, SECTOREnergy, SECTORUtilities, SECTORReal_Estate, SECTORCommunication_Services, SECTORUnknown, security_typeETF)
y_logit_m6 <- task3$up_PRC
# Split the dataset into training and testing sets
set.seed(675)
trainIndex <- createDataPartition(y_logit_m6, p = 0.8, list = FALSE)
X_train_logit_m6 <- X_logit_m6[trainIndex, ]
X_test_logit_m6 <- X_logit_m6[-trainIndex, ]
y_train_logit_m6 <- y_logit_m6[trainIndex]
y_test_logit_m6 <- y_logit_m6[-trainIndex]
# Fit the logistic regression model
logit_model_6 <- glm(y_train_logit_m6 ~ .,
data = as.data.frame(X_train_logit_m6, y_train_logit_m6),
family = binomial())
# Display model summary
summary(logit_model_6)##
## Call:
## glm(formula = y_train_logit_m6 ~ ., family = binomial(), data = as.data.frame(X_train_logit_m6,
## y_train_logit_m6))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.16952 0.88471 4.713 2.44e-06 ***
## Mkt_beta -79.81355 12.37080 -6.452 1.11e-10 ***
## SMB_beta -4.09127 8.04088 -0.509 0.610887
## scaled_dollar_vol -0.17401 0.24324 -0.715 0.474380
## log_spread -0.49625 0.10032 -4.947 7.55e-07 ***
## log_volatility -0.24133 0.11928 -2.023 0.043059 *
## log_tracking_error 2.70666 0.23395 11.569 < 2e-16 ***
## log_turnover_ratio -0.01594 0.09444 -0.169 0.865986
## SECTORMaterials -1.94865 0.70959 -2.746 0.006029 **
## SECTORIndustrials -3.32320 0.82387 -4.034 5.49e-05 ***
## SECTORFinancials -2.56701 0.66797 -3.843 0.000122 ***
## SECTORInformation_Technology -2.49134 0.64171 -3.882 0.000103 ***
## SECTORConsumer_Discretionary -3.65068 1.05825 -3.450 0.000561 ***
## SECTORHealth_Care -1.37650 0.41603 -3.309 0.000937 ***
## SECTOREnergy -2.25860 0.92833 -2.433 0.014975 *
## SECTORUtilities -1.88301 1.07049 -1.759 0.078574 .
## SECTORReal_Estate -15.69765 849.22171 -0.018 0.985252
## SECTORCommunication_Services -16.01693 422.86756 -0.038 0.969786
## SECTORUnknown -1.72958 0.39903 -4.334 1.46e-05 ***
## security_typeETF 1.41723 0.39070 3.627 0.000286 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1432.02 on 4151 degrees of freedom
## Residual deviance: 944.25 on 4132 degrees of freedom
## AIC: 984.25
##
## Number of Fisher Scoring iterations: 16
# Calculate predicted probabilities on testing data
pred_prob_m6 <- predict(logit_model_6,
newdata = as.data.frame(X_test_logit_m6),
type = "response")
# Determine classification based on threshold of 0.5
class_m6 <- ifelse(pred_prob_m6 > 0.5, 1, 0)
# Calculate True Positives, True Negatives, False Positives, False Negatives
true_positives_m6 <- sum(y_test_logit_m6 == 1 & class_m6 == 1)
true_negatives_m6 <- sum(y_test_logit_m6 == 0 & class_m6 == 0)
false_positives_m6 <- sum(y_test_logit_m6 == 0 & class_m6 == 1)
false_negatives_m6 <- sum(y_test_logit_m6 == 1 & class_m6 == 0)
# Calculate Accuracy
accuracy_m6 <- (true_positives_m6 + true_negatives_m6) / (true_positives_m6 + true_negatives_m6 + false_positives_m6 + false_negatives_m6)
# Calculate Precision
precision_m6 <- ifelse((true_positives_m6 + false_positives_m6) == 0, 0, true_positives_m6 / (true_positives_m6 + false_positives_m6))
# Calculate Recall
recall_m6 <- ifelse((true_positives_m6 + false_negatives_m6) == 0, 0, true_positives_m6 / (true_positives_m6 + false_negatives_m6))
# Calculate F1 Score
f1_m6 <- 2 * (precision_m6 * recall_m6) / (precision_m6 + recall_m6)
# Compute ROC curve and AUC
roc_curve_m6 <- roc(y_test_logit_m6, pred_prob_m6)
auc_value_m6 <- auc(roc_curve_m6)
# Calculate McFadden’s R-squared
null_model_6 <- glm(y_train_logit_m6 ~ 1, data = as.data.frame(cbind(X_train_logit_m6, y_train_logit_m6)), family = binomial())
rsquared_logit_m6 <- 1 - (logLik(logit_model_6) / logLik(null_model_6))
# Create a dataframe for counts
metrics_counts_logit_m6 <- data.frame(
Metric = c('True Positives', 'True Negatives', 'False Positives', 'False Negatives'),
Count = c(
as.integer(true_positives_m6),
as.integer(true_negatives_m6),
as.integer(false_positives_m6),
as.integer(false_negatives_m6)
)
)
# Create a dataframe for performance metrics
metrics_evaluation_logit_m6 <- data.frame(
Metric = c('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'R_squared'),
Value = c(
accuracy_m6,
precision_m6,
recall_m6,
f1_m6,
auc_value_m6,
rsquared_logit_m6
)
)
# Print the results
print(metrics_counts_logit_m6)## Metric Count
## 1 True Positives 11
## 2 True Negatives 1010
## 3 False Positives 3
## 4 False Negatives 14
## Metric Value
## 1 Accuracy 0.9836224
## 2 Precision 0.7857143
## 3 Recall 0.4400000
## 4 F1 Score 0.5641026
## 5 AUC 0.8898322
## 6 R_squared 0.3406185
Confusion Matrix
true_false_m6 <- matrix(c(true_negatives_m6, false_positives_m6, false_negatives_m6, true_positives_m6),
nrow = 2,
dimnames = list(Predicted = c("Negative", "Positive"),
Actual = c("Negative", "Positive")))
conf_matrix_m6 <- as.data.frame(as.table(true_false_m6))
# Plot the confusion matrix
palette <- brewer.pal(n = 4,
name = "Greens") # Define color
conf_matrix_m6_plot <- ggplot(conf_matrix_m6,
aes(x = Predicted,
y = Actual,
fill = Freq)) +
geom_tile(color = "black") +
geom_text(aes(label = Freq),
color = "black",
size = 6) +
scale_fill_gradientn(colors = brewer.pal(n = 4,
name = "Greens"),
guide = "none") +
labs(title = "Confusion Matrix for Model 6: Price Increase Predictions between February 14, 2020 to March 20, 2020",
subtitle = "(Model 6: Same variables with Model 3 and with Type of security variable)",
x = "Predicted",
y = "Actual") +
theme_minimal() +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
print(conf_matrix_m6_plot) Important Factor
# Extract coefficients
coefficients_logit_m6 <- coef(logit_model_6)
# Convert coefficients to odds ratios
odds_ratios_m6 <- exp(coefficients_logit_m6)
# Identify intercept and filter it out
importance_logit_m6 <- data.frame(
Factors = names(coefficients_logit_m6),
Odds_Ratio = odds_ratios_m6
)
# Exclude the intercept (assuming it's named 'Intercept')
importance_logit_m6 <- importance_logit_m6[importance_logit_m6$Factor != "(Intercept)", ]
# Add a column for fill color based on the odds ratio
importance_logit_m6$Fill_Color <- ifelse(importance_logit_m6$Odds_Ratio > 1, "steelblue", "coral")
# Plot with positive and negative impacts
# Plot the factor
ggplot(importance_logit_m6,
aes(x = reorder(Factors, Odds_Ratio),
y = Odds_Ratio,
fill = Fill_Color)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_text(aes(label = round(Odds_Ratio, 2)),
hjust = 1,
size = 3) +
labs(title = "Impact of Factors on Price increase likelihood for Model 5: Positive Vs Negative Associations",
subtitle = "(Model 6: Same variables with Model 3 and with Type of security variable)",
x = "Factors",
y = "Odds Ratio\n(The exponentiation of coefficients)") +
geom_hline(yintercept = 1,
linetype = "dashed",
color = "red") +
annotate("text",
x = Inf,
y = 1,
label = "Odds Ratio = 1",
hjust = 0,
vjust = 0,
color = "grey",
size = 3.5,
fontface = "italic") +
theme_minimal() +
scale_fill_manual(
values = c("steelblue" = "steelblue", "coral" = "coral"),
labels = c("steelblue" = "Positively Associated with Price Increase",
"coral" = "Negatively Associated with Price Increase"),
name = "") +
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
axis.title.y = element_text(color = 'black'),
axis.text.y = element_text(color = 'black'),
legend.position = 'bottom'
) Model variables difference
logit_m1_v <- get_variables(logit_model_1)
logit_m2_v <- get_variables(logit_model_2)
logit_m3_v <- get_variables(logit_model_3)
logit_m4_v <- get_variables(logit_model_4)
logit_m5_v <- get_variables(logit_model_5)
logit_m6_v <- get_variables(logit_model_6)
logit_v_list <- list(
Model_1 = as.character(logit_m1_v),
Model_2 = as.character(logit_m2_v),
Model_3 = as.character(logit_m3_v),
Model_4 = as.character(logit_m4_v),
Model_5 = as.character(logit_m5_v),
Model_6 = as.character(logit_m6_v)
)
# Initialise empty data frame
logit_model_data <- data.frame(
model = character(),
variable = character(),
order = integer(),
stringsAsFactors = FALSE
)
# Populate the dataframe with the original order of variables
for (model in names(logit_v_list)) {
variables <- logit_v_list[[model]] # Get the variables for each model
temp_df <- data.frame(
model = model,
variable = variables,
order = seq_along(variables),
stringsAsFactors = FALSE
)
logit_model_data <- rbind(logit_model_data, temp_df)
}
# Convert variable to a factor with levels
logit_model_data$variable <- factor(logit_model_data$variable, levels = unique(logit_model_data$variable[order(logit_model_data$order)]))
# Generate the Plot
logit_models_v <- ggplot(logit_model_data,
aes(x = model,
y = variable,
fill = model)) +
geom_tile(color = "white") +
labs(
title = "Variables Included in Each Logistic Model",
x = "Model",
y = "Variables"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45,
hjust = 1,
size = 12),
axis.text.y = element_text(size = 12),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
plot.title = element_text(hjust = 0.5,
size = 16),
plot.title.position = "plot")
print(logit_models_v)4 K-mean Clustering
# Import data
Stock_data_2 <- read.csv("Stock_data_part2.csv")
# Make sure all the "date" are converted to ISO 8601 format
Stock_data_2$public_date <- as.Date(Stock_data_2$public_date, format = "%d/%m/%Y")
Stock_data_2$public_date <- format(Stock_data_2$public_date, format = "%Y-%m-%d")
Stock_data_2$public_date <- as.Date(Stock_data_2$public_date, format = "%Y-%m-%d")
# Inspect data
str(Stock_data_2)## 'data.frame': 577832 obs. of 13 variables:
## $ public_date: Date, format: "2010-01-31" "2010-02-28" ...
## $ CAPEI : num 17.9 17.6 19.3 18.3 14.8 ...
## $ bm : num 1.077 1.077 1.077 0.898 0.898 ...
## $ evm : num 7.98 7.98 7.98 9.29 9.29 ...
## $ pe_exi : num 16.4 16.1 17.6 19.3 15.6 ...
## $ dpr : num 0 0 0 0 0 0 0 0 0 0 ...
## $ npm : num 0.041 0.041 0.041 0.037 0.037 0.037 0.033 0.033 0.033 0.037 ...
## $ roa : num 0.095 0.095 0.095 0.086 0.086 0.086 0.093 0.093 0.093 0.078 ...
## $ roe : num 0.079 0.079 0.079 0.067 0.067 0.067 0.059 0.059 0.059 0.064 ...
## $ roce : num 0.099 0.099 0.099 0.097 0.097 0.097 0.083 0.083 0.083 0.097 ...
## $ ptb : num 1.152 1.128 1.237 1.197 0.968 ...
## $ divyield : chr "" "" "" "" ...
## $ TICKER : chr "AIR" "AIR" "AIR" "AIR" ...
## public_date CAPEI bm evm pe_exi dpr npm roa roe roce ptb
## 1 2010-01-31 17.934 1.077 7.981 16.433 0 0.041 0.095 0.079 0.099 1.152
## 2 2010-02-28 17.554 1.077 7.981 16.085 0 0.041 0.095 0.079 0.099 1.128
## 3 2010-03-31 19.257 1.077 7.981 17.603 0 0.041 0.095 0.079 0.099 1.237
## 4 2010-04-30 18.304 0.898 9.293 19.349 0 0.037 0.086 0.067 0.097 1.197
## 5 2010-05-31 14.791 0.898 9.293 15.635 0 0.037 0.086 0.067 0.097 0.968
## divyield TICKER
## 1 AIR
## 2 AIR
## 3 AIR
## 4 AIR
## 5 AIR
Select the two most important variables for clustering
# Create a cross-sectional dataset
task4_select_ratios <- na.omit(
Stock_data_2 %>%
filter(TICKER != "") %>%
group_by(TICKER) %>%
summarise(
TICKER = first(TICKER),
CAPEI = median(CAPEI, na.rm = TRUE),
bm = median(bm, na.rm = TRUE),
evm = median(evm, na.rm = TRUE),
pe_exi = median(pe_exi, na.rm = TRUE),
dpr = median(dpr, na.rm = TRUE),
npm = median(npm, na.rm = TRUE),
roa = median(roa, na.rm = TRUE),
roe = median(roe, na.rm = TRUE),
roce = median(roce, na.rm = TRUE),
ptb = median(ptb, na.rm = TRUE),
divyield = median(as.numeric(sub("%", "", divyield)), na.rm = TRUE),
.groups = "drop"
)) %>%
select(-TICKER)
# Random Forest Importance
task4_rf_model <- randomForest(x = task4_select_ratios, y = NULL, ntree = 100)
task4_select_ratios_rf <- importance(task4_rf_model)
task4_rf_df <- data.frame(Feature = rownames(task4_select_ratios_rf),
MeanDecreaseGini = task4_select_ratios_rf[, "MeanDecreaseGini"])
# Plot the result
task4_select <- ggplot(task4_rf_df,
aes(x = reorder(Feature, MeanDecreaseGini),
y = MeanDecreaseGini)) +
geom_bar(stat = "identity", fill = "lightblue") +
coord_flip() +
labs(title = "Feature Importance of financial ratio",
x = "Financial Ratio",
y = "Mean Decrease in Gini") +
theme_minimal()+
theme(
plot.title = element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
print(task4_select)Create function to transform dataset to cross-sectional
task4_dataset_transformation <- function(data, filter_date, output_name_cleaned, output_name_clustering) {
aggregated_data <- data %>%
filter(TICKER != "") %>%
filter(public_date < filter_date) %>%
group_by(TICKER) %>%
summarise(
TICKER = first(TICKER),
# get the median (avoid outlier affect the result)
bm = median(bm, na.rm = TRUE),
roe = median(roe, na.rm = TRUE),
.groups = "drop"
)
print(head(aggregated_data, n = 5))
# Remove outliers for B/M ratio by IQR method
Q1_BM <- quantile(aggregated_data$bm, 0.25, na.rm = TRUE)
Q3_BM <- quantile(aggregated_data$bm, 0.75, na.rm = TRUE)
IQR_BM <- Q3_BM - Q1_BM
cleaned_data <- aggregated_data[aggregated_data$bm > (Q1_BM - 1.5 * IQR_BM) &
aggregated_data$bm < (Q3_BM + 1.5 * IQR_BM), ]
# Remove outliers for ROE with IQR method
Q1_ROE <- quantile(cleaned_data$roe, 0.25, na.rm = TRUE)
Q3_ROE <- quantile(cleaned_data$roe, 0.75, na.rm = TRUE)
IQR_ROE <- Q3_ROE - Q1_ROE
cleaned_data <- cleaned_data[cleaned_data$roe > (Q1_ROE - 1.5 * IQR_ROE) &
cleaned_data$roe < (Q3_ROE + 1.5 * IQR_ROE), ]
# Join with returns data - calculate return using the price of the last day and the first day from the period to get the % of price change
cluster_data <- cleaned_data %>%
left_join(task2 %>%
filter(date >= "2019-12-14" & date <= "2020-01-20") %>%
group_by(TICKER) %>%
summarise(
noncovid_RET = (PRC[which.max(date)] - PRC[which.min(date)]) / PRC[which.min(date)]),
by = "TICKER") %>%
left_join(task2 %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
group_by(TICKER) %>%
summarise(
COVID_RET = (PRC[which.max(date)] - PRC[which.min(date)]) / PRC[which.min(date)]),
by = "TICKER") %>%
left_join(task2 %>%
filter(date >= "2019-12-14" & date <= "2020-01-20") %>%
group_by(TICKER) %>%
summarise(
noncovid_sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
noncovid_volatility = mean(volatility, na.rm = TRUE)
),
by = "TICKER") %>%
left_join(task2 %>%
filter(date >= "2020-02-14" & date <= "2020-03-20") %>%
group_by(TICKER) %>%
summarise(
covid_sharpe_ratio = mean(sharpe_ratio, na.rm = TRUE),
covid_volatility = mean(volatility, na.rm = TRUE)
),
by = "TICKER") %>%
# Remove missing value
filter(!is.na(bm) & !is.na(roe)) %>%
# Standardise variable to ensure they are on same scale
mutate(
bm_scaled = scale(bm),
roe_scaled = scale(roe)
)
print(head(cluster_data, n = 5))
# Select scaled variables
output_data <- cluster_data[, c("bm_scaled", "roe_scaled")]
print(head(output_data, n = 5))
# Assign output name
assign(output_name_cleaned, cluster_data, envir = .GlobalEnv)
assign(output_name_clustering, output_data, envir = .GlobalEnv)
}Call function to create dataset
# Entire Period
task4_dataset_transformation(Stock_data_2, "2022-12-31", "task4_entire", "task4_cluster_entire")## # A tibble: 5 × 3
## TICKER bm roe
## <chr> <dbl> <dbl>
## 1 A 0.266 0.149
## 2 AA 0.927 0.001
## 3 AAC 0.712 -0.006
## 4 AACC 1.13 0.005
## 5 AACQ 0.698 -0.029
## # A tibble: 5 × 11
## TICKER bm roe noncovid_RET COVID_RET noncovid_sharpe_ratio
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 0.266 0.149 0.0671 -0.226 -175.
## 2 AA 0.927 0.001 -0.157 -0.650 -82.5
## 3 AAC 0.712 -0.006 NA NA NA
## 4 AACC 1.13 0.005 NA NA NA
## 5 AACQ 0.698 -0.029 NA NA NA
## # ℹ 5 more variables: noncovid_volatility <dbl>, covid_sharpe_ratio <dbl>,
## # covid_volatility <dbl>, bm_scaled <dbl[,1]>, roe_scaled <dbl[,1]>
## # A tibble: 5 × 2
## bm_scaled[,1] roe_scaled[,1]
## <dbl> <dbl>
## 1 -0.874 0.723
## 2 0.968 0.169
## 3 0.369 0.143
## 4 1.53 0.184
## 5 0.330 0.0569
# Non-COVID period
task4_dataset_transformation(Stock_data_2, "2019-12-14", "task4_noncovid", "task4_cluster_noncovid")## # A tibble: 5 × 3
## TICKER bm roe
## <chr> <dbl> <dbl>
## 1 A 0.29 0.133
## 2 AA 0.961 0.004
## 3 AAC 0.712 -0.006
## 4 AACC 1.13 0.005
## 5 AAI 0.714 0.113
## # A tibble: 5 × 11
## TICKER bm roe noncovid_RET COVID_RET noncovid_sharpe_ratio
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 0.29 0.133 0.0671 -0.226 -175.
## 2 AA 0.961 0.004 -0.157 -0.650 -82.5
## 3 AAC 0.712 -0.006 NA NA NA
## 4 AACC 1.13 0.005 NA NA NA
## 5 AAI 0.714 0.113 NA NA NA
## # ℹ 5 more variables: noncovid_volatility <dbl>, covid_sharpe_ratio <dbl>,
## # covid_volatility <dbl>, bm_scaled <dbl[,1]>, roe_scaled <dbl[,1]>
## # A tibble: 5 × 2
## bm_scaled[,1] roe_scaled[,1]
## <dbl> <dbl>
## 1 -0.896 0.597
## 2 0.946 -0.0776
## 3 0.262 -0.130
## 4 1.41 -0.0723
## 5 0.268 0.492
# COVID period
task4_dataset_transformation(Stock_data_2, "2020-02-14", "task4_covid", "task4_cluster_covid")## # A tibble: 5 × 3
## TICKER bm roe
## <chr> <dbl> <dbl>
## 1 A 0.29 0.133
## 2 AA 0.961 0.001
## 3 AAC 0.712 -0.006
## 4 AACC 1.13 0.005
## 5 AAI 0.714 0.113
## # A tibble: 5 × 11
## TICKER bm roe noncovid_RET COVID_RET noncovid_sharpe_ratio
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 0.29 0.133 0.0671 -0.226 -175.
## 2 AA 0.961 0.001 -0.157 -0.650 -82.5
## 3 AAC 0.712 -0.006 NA NA NA
## 4 AACC 1.13 0.005 NA NA NA
## 5 AAI 0.714 0.113 NA NA NA
## # ℹ 5 more variables: noncovid_volatility <dbl>, covid_sharpe_ratio <dbl>,
## # covid_volatility <dbl>, bm_scaled <dbl[,1]>, roe_scaled <dbl[,1]>
## # A tibble: 5 × 2
## bm_scaled[,1] roe_scaled[,1]
## <dbl> <dbl>
## 1 -0.896 0.598
## 2 0.945 -0.0896
## 3 0.262 -0.126
## 4 1.41 -0.0687
## 5 0.267 0.494
Create function to generate Elbow plot
generate_elbow_plot <- function(find_k_data, elbow_plot_name) {
inertia_cleaned <- numeric(10)
for (n in 1:10) {
k_mean_cleaned <- tryCatch({
kmeans(find_k_data, centers = n, nstart = 10)
}, error = function(e) {
return(NULL)
})
# Debugging
if (!is.null(k_mean_cleaned)) {
inertia_cleaned[n] <- k_mean_cleaned$tot.withinss
cat(sprintf("Cluster: %d, Inertia: %f\n", n, inertia_cleaned[n]))
} else {
cat(sprintf("An error occurred at n_clusters=%d\n", n))
}
}
# Debugging
if (length(inertia_cleaned) == 10) {
elbow_data_cleaned <- data.frame(
clusters_cleaned = 1:10,
inertia_cleaned = inertia_cleaned
)
# Generate the plot
elbow_plot <- ggplot(elbow_data_cleaned,
aes(x = clusters_cleaned,
y = inertia_cleaned)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red",
size = 4,
shape = 19) +
scale_x_continuous(breaks = 1:10) +
ggtitle('Elbow Method for Clustering') +
xlab('Number of Clusters') +
ylab('Inertia') +
theme_light() +
theme(
plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
print(elbow_plot)
# Save the plot for report
save_plot(elbow_plot, elbow_plot_name)
} else {
cat("There is some error.")
}
}Create function to generate cluster visualisation
generate_cluster_visualisation <- function(clustering_data, cluster_data, cluster_plot_name, period_for_title, centers, output_name) {
set.seed(42)
k_mean_cleaned <- kmeans(clustering_data, centers = centers, nstart = 10)
# Assign cluster to each stock
cluster_data$cluster <- factor(k_mean_cleaned$cluster)
# Convert centroids to data frame
centroids_cleaned <- as.data.frame(k_mean_cleaned$centers)
names(centroids_cleaned) <- c("bm_scaled", "roe_scaled")
# Plot the scatterplot with specified clusters
cluster_visual <- ggplot(cluster_data,
aes(x = bm_scaled,
y = roe_scaled,
color = cluster)) +
geom_point(size = 3) +
geom_point(data = centroids_cleaned,
aes(x = bm_scaled,
y = roe_scaled,
shape = "Centroid"),
color = 'red',
size = 6,
alpha = 1) +
scale_shape_manual(name = "",
values = c("Centroid" = 8)) +
labs(title = 'Cluster Visualisation: Market-to-Book ratio VS Return on Equity',
subtitle = period_for_title,
x = 'Standardised Book / Market ratio (bm)',
y = 'Standardised Return on Equity (roe)',
color = 'Cluster') +
theme(plot.title = element_text(size = 16,
face = "bold"),
plot.subtitle = element_text(size = 12)) +
scale_color_brewer(palette = "Pastel2") +
theme_minimal() +
theme(panel.grid.major = element_line(color = "gray90",
size = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
plot.title = element_text(size = 16,
face = "bold"),
plot.subtitle = element_text(size = 12))
print(cluster_visual)
# Save plot for report
save_plot(cluster_visual, cluster_plot_name)
# Assign output name
assign(output_name, cluster_data, envir = .GlobalEnv)
}Call function to generate elbow plot and cluster visualisation
## Cluster: 1, Inertia: 12512.000000
## Cluster: 2, Inertia: 7642.593033
## Cluster: 3, Inertia: 4098.499759
## Cluster: 4, Inertia: 3178.173036
## Cluster: 5, Inertia: 2560.094173
## Cluster: 6, Inertia: 2164.649512
## Cluster: 7, Inertia: 1819.718568
## Cluster: 8, Inertia: 1594.905004
## Cluster: 9, Inertia: 1431.724197
## Cluster: 10, Inertia: 1274.023383
generate_cluster_visualisation(task4_cluster_entire, task4_entire, "Task 4 Clustering (Entire)", "Aggregate from Entire Period", 3, "task4_entire")## Cluster: 1, Inertia: 9994.000000
## Cluster: 2, Inertia: 6534.440559
## Cluster: 3, Inertia: 3474.260447
## Cluster: 4, Inertia: 2633.705462
## Cluster: 5, Inertia: 2146.807255
## Cluster: 6, Inertia: 1791.804979
## Cluster: 7, Inertia: 1501.036253
## Cluster: 8, Inertia: 1305.638966
## Cluster: 9, Inertia: 1171.676067
## Cluster: 10, Inertia: 1058.081745
generate_cluster_visualisation(task4_cluster_noncovid, task4_noncovid, "Task 4 Clustering (Non-COVID)","Aggregate from Non-COVID Period", 3, "task4_noncovid")## Cluster: 1, Inertia: 10024.000000
## Cluster: 2, Inertia: 6542.387531
## Cluster: 3, Inertia: 3475.782907
## Cluster: 4, Inertia: 2637.769299
## Cluster: 5, Inertia: 2156.451101
## Cluster: 6, Inertia: 1800.543680
## Cluster: 7, Inertia: 1501.480959
## Cluster: 8, Inertia: 1303.700814
## Cluster: 9, Inertia: 1175.419993
## Cluster: 10, Inertia: 1062.107336
generate_cluster_visualisation(task4_cluster_covid, task4_covid, "Task 4 Clustering (COVID)","Aggregate from COVID Period", 3, "task4_covid")Cluster Characteristics
# Entire Period
Cluster_characteristics_entire <- task4_entire %>%
group_by(cluster) %>%
summarise(mean_bm = mean(bm, na.rm = TRUE),
mean_roe = mean(roe, na.rm = TRUE))
print(Cluster_characteristics_entire)## # A tibble: 3 × 3
## cluster mean_bm mean_roe
## <fct> <dbl> <dbl>
## 1 1 0.367 0.104
## 2 2 0.349 -0.491
## 3 3 0.969 0.00699
# Non-COVID Period
Cluster_characteristics_noncovid <- task4_noncovid %>%
group_by(cluster) %>%
summarise(mean_bm = mean(bm, na.rm = TRUE),
mean_roe = mean(roe, na.rm = TRUE))
print(Cluster_characteristics_noncovid)## # A tibble: 3 × 3
## cluster mean_bm mean_roe
## <fct> <dbl> <dbl>
## 1 1 0.387 -0.334
## 2 2 0.997 0.0231
## 3 3 0.394 0.126
# COVID period
Cluster_characteristics_covid <- task4_covid %>%
group_by(cluster) %>%
summarise(mean_bm = mean(bm, na.rm = TRUE),
mean_roe = mean(roe, na.rm = TRUE))
print(Cluster_characteristics_covid)## # A tibble: 3 × 3
## cluster mean_bm mean_roe
## <fct> <dbl> <dbl>
## 1 1 0.998 0.0233
## 2 2 0.394 0.126
## 3 3 0.386 -0.333
Average return by cluster for each period
# Create function to calculate average return by cluster for each period
calculate_returns_cluster <- function(cluster_data, output_prefix) {
output_covid <- cluster_data %>%
filter(!is.na(COVID_RET) & !is.na(noncovid_RET)) %>%
group_by(cluster) %>%
summarise(cluster_RET_covid = mean(COVID_RET, na.rm = TRUE))
output_noncovid <- cluster_data %>%
filter(!is.na(COVID_RET) & !is.na(noncovid_RET)) %>%
group_by(cluster) %>%
summarise(cluster_RET_noncovid = mean(noncovid_RET, na.rm = TRUE))
# Print the result
print(output_noncovid)
print(output_covid)
}
# Call function
calculate_returns_cluster(task4_entire, "entire")## # A tibble: 3 × 2
## cluster cluster_RET_noncovid
## <fct> <dbl>
## 1 1 0.0403
## 2 2 0.334
## 3 3 0.0606
## # A tibble: 3 × 2
## cluster cluster_RET_covid
## <fct> <dbl>
## 1 1 -0.380
## 2 2 -0.304
## 3 3 -0.375
## # A tibble: 3 × 2
## cluster cluster_RET_noncovid
## <fct> <dbl>
## 1 1 0.186
## 2 2 0.0739
## 3 3 0.0336
## # A tibble: 3 × 2
## cluster cluster_RET_covid
## <fct> <dbl>
## 1 1 -0.334
## 2 2 -0.381
## 3 3 -0.383
## # A tibble: 3 × 2
## cluster cluster_RET_noncovid
## <fct> <dbl>
## 1 1 0.0729
## 2 2 0.0334
## 3 3 0.184
## # A tibble: 3 × 2
## cluster cluster_RET_covid
## <fct> <dbl>
## 1 1 -0.380
## 2 2 -0.382
## 3 3 -0.342
Investment implications
# Function to normalise metrics and calculate composite scores
calculate_composite_scores <- function(data, selected_cluster, output_prefix) {
# Normalisation for Non-COVID period
normalised_data_noncovid <- data %>%
filter(cluster == selected_cluster) %>%
filter(!if_any(starts_with("noncovid_"), is.na)) %>%
mutate(
noncovid_RET_normalised = (noncovid_RET - min(noncovid_RET)) / (max(noncovid_RET) - min(noncovid_RET)),
noncovid_volatility_normalised = 1 - ((noncovid_volatility - min(noncovid_volatility)) / (max(noncovid_volatility) - min(noncovid_volatility))),
noncovid_sharpe_ratio_normalised = (noncovid_sharpe_ratio - min(noncovid_sharpe_ratio)) / (max(noncovid_sharpe_ratio) - min(noncovid_sharpe_ratio))
)
# Calculate scores based on different strategies for Non-COVID
strategies_noncovid <- list(
max_return_noncovid = list(weights = c(0.5, 0.3, 0.2),
columns = c("noncovid_RET_normalised",
"noncovid_sharpe_ratio_normalised",
"noncovid_volatility_normalised")),
risk_adjusted_noncovid = list(weights = c(0.3, 0.5, 0.2),
columns = c("noncovid_RET_normalised",
"noncovid_sharpe_ratio_normalised",
"noncovid_volatility_normalised")),
low_risk_noncovid = list(weights = c(0.2, 0.3, 0.5),
columns = c("noncovid_RET_normalised",
"noncovid_sharpe_ratio_normalised",
"noncovid_volatility_normalised"))
)
results_noncovid <- list()
for (strategy in names(strategies_noncovid)) {
weight_vector <- strategies_noncovid[[strategy]]$weights
score_columns <- strategies_noncovid[[strategy]]$columns
results_noncovid[[strategy]] <- normalised_data_noncovid %>%
mutate(
composite_score = rowSums(sweep(select(., all_of(score_columns)), 2, weight_vector, `*`))
) %>%
arrange(desc(composite_score)) %>%
select(TICKER, noncovid_RET_normalised, noncovid_volatility_normalised, noncovid_sharpe_ratio_normalised, composite_score)
# Print the top 3 securities
cat(paste("\nTop 3 for", strategy, ":\n"))
print(head(results_noncovid[[strategy]], 3))
}
# Normalisation for COVID period
normalised_data_covid <- data %>%
filter(cluster == selected_cluster) %>%
filter(!is.na(COVID_RET) & !if_any(starts_with("covid_"), is.na)) %>%
mutate(
covid_RET_normalised = (COVID_RET - min(COVID_RET)) / (max(COVID_RET) - min(COVID_RET)),
covid_volatility_normalised = 1 - ((covid_volatility - min(covid_volatility)) / (max(covid_volatility) - min(covid_volatility))),
covid_sharpe_ratio_normalised = (covid_sharpe_ratio - min(covid_sharpe_ratio)) / (max(covid_sharpe_ratio) - min(covid_sharpe_ratio))
)
# Calculate scores based on different strategies for COVID
strategies_covid <- list(
max_return_covid = list(weights = c(0.5, 0.3, 0.2),
columns = c("covid_RET_normalised",
"covid_sharpe_ratio_normalised",
"covid_volatility_normalised")),
risk_adjusted_covid = list(weights = c(0.3, 0.5, 0.2),
columns = c("covid_RET_normalised",
"covid_sharpe_ratio_normalised",
"covid_volatility_normalised")),
low_risk_covid = list(weights = c(0.2, 0.3, 0.5),
columns = c("covid_RET_normalised",
"covid_sharpe_ratio_normalised",
"covid_volatility_normalised"))
)
results_covid <- list()
for (strategy in names(strategies_covid)) {
weight_vector <- strategies_covid[[strategy]]$weights
score_columns <- strategies_covid[[strategy]]$columns
results_covid[[strategy]] <- normalised_data_covid %>%
mutate(
composite_score = rowSums(sweep(select(., all_of(score_columns)), 2, weight_vector, `*`))
) %>%
arrange(desc(composite_score)) %>%
select(TICKER, covid_RET_normalised, covid_volatility_normalised, covid_sharpe_ratio_normalised, composite_score)
# Print the top 3 securities
cat(paste("\nTop 3 for", strategy, ":\n"))
print(head(results_covid[[strategy]], 3))
}
}
# Call function
calculate_composite_scores(task4_entire, 3, "entire")##
## Top 3 for max_return_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 IDXG 1 0.897 0.994
## 2 USEG 0.825 0.886 0.994
## 3 ASNA 0.726 0.887 0.995
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for risk_adjusted_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 IDXG 1 0.897 0.994
## 2 USEG 0.825 0.886 0.994
## 3 ASNA 0.726 0.887 0.995
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for low_risk_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 IDXG 1 0.897 0.994
## 2 USEG 0.825 0.886 0.994
## 3 ASNA 0.726 0.887 0.995
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for max_return_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 AHPI 1 0.0210 0.999
## 2 PLAG 0.206 0.847 0.994
## 3 FTR 0.177 0.881 0.989
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for risk_adjusted_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 AHPI 1 0.0210 0.999
## 2 PLAG 0.206 0.847 0.994
## 3 FTR 0.177 0.881 0.989
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for low_risk_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 CIIC 0.104 0.997 0.969
## 2 HCCO 0.0987 0.981 0.988
## 3 AMHC 0.104 0.985 0.968
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for max_return_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 PECK 1 0.757 0.990
## 2 NLS 0.917 0.798 0.988
## 3 LMB 0.621 0.847 0.980
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for risk_adjusted_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 PECK 1 0.757 0.990
## 2 NLS 0.917 0.798 0.988
## 3 LMB 0.621 0.847 0.980
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for low_risk_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 NLS 0.917 0.798 0.988
## 2 PECK 1 0.757 0.990
## 3 FPH 0.538 0.932 0.958
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for max_return_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 AIM 1 0.228 0.997
## 2 UNFI 0.702 0.818 0.987
## 3 ZM 0.675 0.872 0.985
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for risk_adjusted_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 ZM 0.675 0.872 0.985
## 2 UNFI 0.702 0.818 0.987
## 3 EQT 0.661 0.826 0.987
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for low_risk_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 ZM 0.675 0.872 0.985
## 2 VIRT 0.551 0.919 0.975
## 3 KR 0.515 0.926 0.965
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for max_return_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 NLS 1 0.798 0.988
## 2 LMB 0.677 0.847 0.980
## 3 AIM 0.636 0.826 0.985
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for risk_adjusted_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 NLS 1 0.798 0.988
## 2 LMB 0.677 0.847 0.980
## 3 AIM 0.636 0.826 0.985
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for low_risk_noncovid :
## # A tibble: 3 × 5
## TICKER noncovid_RET_normalised noncovid_volatility_no…¹ noncovid_sharpe_rati…²
## <chr> <dbl> <dbl> <dbl>
## 1 NLS 1 0.798 0.988
## 2 FPH 0.586 0.932 0.958
## 3 VICR 0.493 0.949 0.946
## # ℹ abbreviated names: ¹​noncovid_volatility_normalised,
## # ²​noncovid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for max_return_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 AIM 1 0.228 0.997
## 2 UNFI 0.704 0.818 0.987
## 3 ZM 0.677 0.872 0.985
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for risk_adjusted_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 ZM 0.677 0.872 0.985
## 2 UNFI 0.704 0.818 0.987
## 3 EQT 0.663 0.826 0.987
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
##
## Top 3 for low_risk_covid :
## # A tibble: 3 × 5
## TICKER covid_RET_normalised covid_volatility_normalised covid_sharpe_ratio_n…¹
## <chr> <dbl> <dbl> <dbl>
## 1 ZM 0.677 0.872 0.985
## 2 VIRT 0.554 0.919 0.975
## 3 KR 0.518 0.926 0.965
## # ℹ abbreviated name: ¹​covid_sharpe_ratio_normalised
## # ℹ 1 more variable: composite_score <dbl>
5 Principal Component Analysis
Data Preparation
# Import data
Stock_data_3 <- read.csv("Stock_data_part3.csv")
sp500 <- read.csv("S&P500.csv")
long_term_rate <- read.csv("long-term-rates-2000-2023.csv")
# Make sure all the "date" are converted to ISO 8601 format
Stock_data_3$date <- as.Date(Stock_data_3$date, format = "%Y-%m-%d")
long_term_rate$date <- as.Date(long_term_rate$date, format = "%Y-%m-%d")
# Inspect data
str(Stock_data_3)## 'data.frame': 1176728 obs. of 12 variables:
## $ PERMNO: int 10001 10001 10001 10001 10001 10001 10001 10001 10001 10001 ...
## $ date : Date, format: "2010-01-29" "2010-02-26" ...
## $ SHRCD : int 11 11 11 11 11 11 11 11 11 11 ...
## $ TICKER: chr "EGAS" "EGAS" "EGAS" "EGAS" ...
## $ PERMCO: int 7953 7953 7953 7953 7953 7953 7953 7953 7953 7953 ...
## $ CUSIP : chr "36720410" "36720410" "36720410" "36720410" ...
## $ HSICIG: int NA NA NA NA NA NA NA NA NA NA ...
## $ PRC : num 10.1 10 10.2 11.4 11.4 ...
## $ VOL : int 3104 1510 2283 3350 3451 3537 2858 2595 1591 1803 ...
## $ RET : chr "-0.018932" "-0.000656" "0.020643" "0.124385" ...
## $ SHROUT: int 4361 4361 4361 6070 6071 6080 6080 6073 6073 6074 ...
## $ SPREAD: num NA NA NA NA NA NA NA NA NA NA ...
## PERMNO date SHRCD TICKER PERMCO CUSIP HSICIG PRC VOL RET
## 1 10001 2010-01-29 11 EGAS 7953 36720410 NA 10.0600 3104 -0.018932
## 2 10001 2010-02-26 11 EGAS 7953 36720410 NA 10.0084 1510 -0.000656
## 3 10001 2010-03-31 11 EGAS 7953 36720410 NA 10.1700 2283 0.020643
## 4 10001 2010-04-30 11 EGAS 7953 36720410 NA 11.3900 3350 0.124385
## 5 10001 2010-05-28 11 EGAS 7953 36720410 NA 11.4000 3451 0.004829
## SHROUT SPREAD
## 1 4361 NA
## 2 4361 NA
## 3 4361 NA
## 4 6070 NA
## 5 6071 NA
## Symbol Name Sector
## 1 MMM 3M Co. Industrials
## 2 ACE ACE Limited Financials
## 3 ABT Abbott Laboratories Health Care
## 4 ANF Abercrombie & Fitch Company A Consumer Discretionary
## 5 ACN Accenture Information Technology
# Data wrangling
sp500 <- sp500 %>%
mutate(Symbol = case_when(
Symbol == "BRK.B" ~ "BRK",
Symbol == "BF.B" ~ "BF",
TRUE ~ Symbol
))# Data cleaning: inspect missing value and extract month for later dataset join
task5_dataset1 <- Stock_data_3 %>%
mutate(
PRC = abs(PRC),
PRC = ifelse(PRC == 0.0, NA, PRC),
VOL = ifelse(VOL == -99, NA, VOL),
RET = case_when(
RET %in% c(-44, -55, -66, -77, -88, -99) ~ NA,
RET %in% c(".A", ".B", ".C", ".D") ~ NA,
TRUE ~ as.numeric(as.character(RET))
),
SPREAD = ifelse(SPREAD <= 0, NA, SPREAD),
month_year = as.yearmon(ymd(date))
) %>%
# Remove duplicate
distinct(PERMNO, date, .keep_all = TRUE) %>%
mutate(
dollar_vol = PRC * VOL
) %>%
# Retain only S&P500 stock
filter(TICKER %in% sp500$Symbol)
# Create month variable for left join and retain only specific date value, then convert to monthly data
long_term_rate <- long_term_rate %>%
mutate(
month_year = as.yearmon(ymd(date))
) %>%
filter(month_year >= "Jan 2010" & month_year <= "Dec 2022") %>%
group_by(month_year) %>%
summarise(
long_term_rate = mean(long_term_rate, na.rm = TRUE)
)
# Same process as long term rate
short_term_rate <- risk_free_rate %>%
mutate(
month_year = as.yearmon(ymd(date))
) %>%
rename(short_term_rate = risk_free_rate) %>%
filter(month_year >= "Jan 2010" & month_year <= "Dec 2022") %>%
group_by(month_year) %>%
summarise(
short_term_rate = mean(short_term_rate, na.rm = TRUE)
)
# Similar process as previous
FFF_monthly <- fama_french_factors %>%
mutate(
month_year = as.yearmon(ymd(date))
) %>%
filter(month_year >= "Jan 2010" & month_year <= "Dec 2022") %>%
group_by(month_year) %>%
summarise(
Mkt = mean(Mkt.RF, na.rm = TRUE),
SMB = mean(SMB, na.rm = TRUE),
HML = mean(HML, na.rm = TRUE)
)
# Process the financial ratio dataset: divyield have a lot of missing value, better not to include it
financial_ratio <- Stock_data_2 %>%
filter(TICKER != "") %>%
mutate(
month_year = as.yearmon(ymd(public_date))) %>%
select(-divyield, -public_date)
# Join the financial ratio dataset
task5_dataset2 <- task5_dataset1 %>%
left_join(
financial_ratio,
by = c("month_year", "TICKER")
)
# Join all the other datasets
task5_dataset3 <- task5_dataset2 %>%
left_join(
long_term_rate,
by = "month_year"
) %>%
left_join(
short_term_rate,
by = "month_year"
) %>%
left_join(
FFF_monthly,
by = "month_year"
)
# Inspect the latest dataset
head(task5_dataset3, n = 5)## PERMNO date SHRCD TICKER PERMCO CUSIP HSICIG PRC VOL
## 1 10104 2010-01-29 11 ORCL 8045 68389X10 NA 23.0600 6068156
## 2 10104 2010-02-26 11 ORCL 8045 68389X10 NA 24.6500 5771538
## 3 10104 2010-03-31 11 ORCL 8045 68389X10 NA 25.7100 6618577
## 4 10104 2010-04-30 11 ORCL 8045 68389X10 NA 25.8675 5580407
## 5 10104 2010-05-28 11 ORCL 8045 68389X10 NA 22.5700 7406752
## RET SHROUT SPREAD month_year dollar_vol CAPEI bm evm pe_exi
## 1 -0.057888 5011220 NA Jan 2010 139931677 26.386 0.252 10.110 20.052
## 2 0.068951 5015000 NA Feb 2010 142268412 28.226 0.252 10.110 21.435
## 3 0.043002 5019091 NA Mar 2010 170163615 29.464 0.252 10.110 22.357
## 4 0.008071 5029523 NA Apr 2010 144351178 28.783 0.233 10.983 23.096
## 5 -0.127477 5026000 NA May 2010 167170393 25.096 0.233 10.983 20.152
## dpr npm roa roe roce ptb long_term_rate short_term_rate
## 1 0.129 0.250 0.232 0.232 0.250 4.145 4.318421 0.3231579
## 2 0.129 0.250 0.232 0.232 0.250 4.434 4.305263 0.3263158
## 3 0.129 0.250 0.232 0.232 0.250 4.628 4.332174 0.3726087
## 4 0.177 0.234 0.227 0.218 0.242 4.515 4.389545 0.4190909
## 5 0.177 0.234 0.227 0.218 0.242 3.937 3.990000 0.3520000
## Mkt SMB HML
## 1 -0.17578947 0.02263158 0.02736842
## 2 0.18315789 0.05947368 0.16842105
## 3 0.26652174 0.06086957 0.09260870
## 4 0.09904762 0.22285714 0.13761905
## 5 -0.39000000 0.01450000 -0.11500000
# Transform to time series dataset (after trying different aggregation methods, take the average value may be the best)
task5_dataset4 <- task5_dataset3 %>%
group_by(date) %>%
summarise(
dollar_vol = mean(dollar_vol, na.rm = TRUE),
RET = mean(RET, na.rm = TRUE),
SHROUT = mean(SHROUT, na.rm = TRUE),
long_term_rate = mean(long_term_rate, na.rm = TRUE),
short_term_rate = mean(short_term_rate, na.rm = TRUE),
Mkt = mean(Mkt, na.rm = TRUE),
SMB = mean(SMB, na.rm = TRUE),
HML = mean(HML, na.rm = TRUE),
CAPEI = mean(CAPEI, na.rm = TRUE),
bm = mean(bm, na.rm = TRUE),
evm = mean(evm, na.rm = TRUE),
pe_exi = mean(pe_exi, na.rm = TRUE),
dpr = mean(dpr, na.rm = TRUE),
npm = mean(npm, na.rm = TRUE),
roa = mean(roa, na.rm = TRUE),
roe = mean(roe, na.rm = TRUE),
roce = mean(roce, na.rm = TRUE),
ptb = mean(ptb, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(
interest_rate_spread = long_term_rate - short_term_rate,
std_ret = rollapply(RET, width = 30, FUN = sd, fill = NA, align = "right", na.rm = TRUE),
volatility = std_ret * sqrt(252)
) %>%
filter(!is.na(volatility))
# Make sure it is a dataframe (otherwise cannot run the PCA)
task5_dataset5 <- as.data.frame(task5_dataset4 %>% select(-c(date, long_term_rate, short_term_rate, Mkt, std_ret)))
# Inspect dataset
summary(task5_dataset4)## date dollar_vol RET SHROUT
## Min. :2012-06-29 Min. : 36847685 Min. :-0.202143 Min. :585492
## 1st Qu.:2015-02-13 1st Qu.: 46833884 1st Qu.:-0.006104 1st Qu.:607794
## Median :2017-09-29 Median : 56757620 Median : 0.013068 Median :632938
## Mean :2017-09-29 Mean : 64129273 Mean : 0.012131 Mean :635908
## 3rd Qu.:2020-05-14 3rd Qu.: 82015779 3rd Qu.: 0.039496 3rd Qu.:649822
## Max. :2022-12-30 Max. :137978627 Max. : 0.172828 Max. :720953
## long_term_rate short_term_rate Mkt SMB
## Min. :1.151 Min. :0.0495 Min. :-0.50895 Min. :-0.304500
## 1st Qu.:2.207 1st Qu.:0.1274 1st Qu.:-0.05159 1st Qu.:-0.084814
## Median :2.661 Median :0.4521 Median : 0.07000 Median : 0.011579
## Mean :2.595 Mean :0.9261 Mean : 0.05092 Mean :-0.001827
## 3rd Qu.:3.009 3rd Qu.:1.5475 3rd Qu.: 0.15394 3rd Qu.: 0.069500
## Max. :4.235 Max. :4.5120 Max. : 0.64048 Max. : 0.375263
## HML CAPEI bm evm
## Min. :-0.693182 Min. :-352.06 Min. :0.4197 Min. : 0.5219
## 1st Qu.:-0.085526 1st Qu.: 12.34 1st Qu.:0.4815 1st Qu.:11.1566
## Median :-0.014000 Median : 19.21 Median :0.5039 Median :12.8450
## Mean : 0.002641 Mean : 15.95 Mean :0.5342 Mean :12.5192
## 3rd Qu.: 0.076450 3rd Qu.: 26.98 3rd Qu.:0.5654 3rd Qu.:13.7939
## Max. : 0.662000 Max. : 315.59 Max. :1.0024 Max. :21.6639
## pe_exi dpr npm roa
## Min. : 6.93 Min. :0.3953 Min. :-48.60342 Min. :0.1045
## 1st Qu.:16.98 1st Qu.:0.5220 1st Qu.: 0.03955 1st Qu.:0.1257
## Median :19.33 Median :0.6409 Median : 0.09532 Median :0.1334
## Mean :19.25 Mean :0.7103 Mean : -3.62829 Mean :0.1328
## 3rd Qu.:21.13 3rd Qu.:0.9030 3rd Qu.: 0.10499 3rd Qu.:0.1436
## Max. :28.84 Max. :1.3859 Max. : 0.11613 Max. :0.1556
## roe roce ptb interest_rate_spread
## Min. :0.02441 Min. :0.1234 Min. :3.108 Min. :-0.6405
## 1st Qu.:0.16965 1st Qu.:0.1598 1st Qu.:4.065 1st Qu.: 1.0142
## Median :0.18671 Median :0.1733 Median :4.875 Median : 1.7391
## Mean :0.20100 Mean :0.1689 Mean :4.933 Mean : 1.6691
## 3rd Qu.:0.25664 3rd Qu.:0.1817 3rd Qu.:5.394 3rd Qu.: 2.2865
## Max. :0.34199 Max. :0.1979 Max. :8.156 Max. : 3.4800
## std_ret volatility
## Min. :0.02154 Min. :0.3420
## 1st Qu.:0.03291 1st Qu.:0.5225
## Median :0.04180 Median :0.6636
## Mean :0.04542 Mean :0.7211
## 3rd Qu.:0.05836 3rd Qu.:0.9264
## Max. :0.07752 Max. :1.2306
## 'data.frame': 127 obs. of 17 variables:
## $ dollar_vol : num 43443315 40588141 38140313 39590774 41834684 ...
## $ RET : num 0.03946 0.00286 0.03124 0.02398 -0.00654 ...
## $ SHROUT : num 587939 586444 589763 589269 585492 ...
## $ SMB : num 0.0343 -0.1305 0.0226 0.0258 -0.0548 ...
## $ HML : num 0.029048 0.000952 0.053913 0.084737 0.173333 ...
## $ CAPEI : num 18 17.8 19 20 18.5 ...
## $ bm : num 0.583 0.583 0.638 0.642 0.64 ...
## $ evm : num 10.6 10.6 10.2 11.1 11.1 ...
## $ pe_exi : num 17.8 17.4 17.9 18.4 18 ...
## $ dpr : num 0.422 0.422 0.525 0.526 0.526 ...
## $ npm : num 0.106 0.106 0.102 0.102 0.102 ...
## $ roa : num 0.156 0.156 0.153 0.153 0.153 ...
## $ roe : num 0.167 0.168 0.177 0.177 0.178 ...
## $ roce : num 0.198 0.198 0.196 0.196 0.195 ...
## $ ptb : num 3.14 3.11 3.11 3.22 3.14 ...
## $ interest_rate_spread: num 2.1 2 2.18 2.28 2.29 ...
## $ volatility : num 0.868 0.859 0.856 0.839 0.838 ...
Run PCA
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3080 1.5510 1.23707 1.18621 1.12444 1.04010 0.94410
## Proportion of Variance 0.3134 0.1415 0.09002 0.08277 0.07437 0.06364 0.05243
## Cumulative Proportion 0.3134 0.4549 0.54488 0.62765 0.70203 0.76566 0.81809
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.86015 0.75763 0.69938 0.59553 0.58830 0.50287 0.39443
## Proportion of Variance 0.04352 0.03376 0.02877 0.02086 0.02036 0.01488 0.00915
## Cumulative Proportion 0.86162 0.89538 0.92415 0.94502 0.96537 0.98025 0.98940
## PC15 PC16 PC17
## Standard deviation 0.29707 0.24997 0.17160
## Proportion of Variance 0.00519 0.00368 0.00173
## Cumulative Proportion 0.99459 0.99827 1.00000
## PC1 PC2 PC3 PC4
## dollar_vol 0.367885748 0.10777197 0.15766493 -0.0844284455
## RET 0.049352005 -0.30559848 -0.30389508 -0.3191010818
## SHROUT 0.388944920 0.14001799 0.10668939 0.0007736679
## SMB 0.026954120 -0.33611941 -0.06459950 -0.1765879614
## HML 0.063068301 -0.04692741 -0.42853262 -0.2851617295
## CAPEI -0.039649103 0.24207246 -0.40135237 0.3030777662
## bm 0.003563024 -0.43146624 0.40517214 -0.0951636288
## evm 0.036316321 0.35990854 -0.13086026 -0.0769245694
## pe_exi -0.016350234 0.32239520 -0.14539727 -0.2264548116
## dpr 0.262791642 0.03833889 0.01307735 0.3006252023
## npm -0.280648118 0.03850808 0.21063836 0.2346180930
## roa -0.353043386 0.20307363 0.10852398 -0.2631312891
## roe 0.128955164 0.39303739 0.21901341 -0.3798395423
## roce -0.309098020 0.21411480 0.22592900 -0.2925551934
## ptb 0.404588648 0.04607298 -0.15224663 -0.0069807801
## interest_rate_spread -0.230684572 -0.16019841 -0.34688185 -0.2177245840
## volatility 0.318638364 -0.09836080 0.14576294 -0.3584064968
## PC5 PC6 PC7 PC8
## dollar_vol -0.09722908 -0.08259944 0.12019048 -0.03430862
## RET -0.12271489 0.41947292 -0.02501959 0.08124524
## SHROUT -0.17252443 0.05863392 -0.01007236 -0.12109082
## SMB -0.24090224 0.32545129 -0.54025777 -0.33695114
## HML -0.17319593 0.13166689 0.68463051 -0.02964720
## CAPEI -0.44721616 -0.10154497 -0.09716755 -0.18655377
## bm 0.22003598 0.06225616 0.20856748 -0.15824081
## evm 0.41002026 0.08500053 0.09282768 -0.69879284
## pe_exi 0.46036278 0.40111848 -0.18365867 0.24274917
## dpr 0.11277157 0.31787832 0.12452483 0.33018061
## npm -0.24104211 0.38481824 0.23383726 0.01295801
## roa -0.15798552 0.01410542 0.03881481 0.12438976
## roe -0.25337674 0.02070937 -0.15452070 0.22916804
## roce -0.13992036 0.02119900 0.04332998 -0.12578038
## ptb 0.03812125 -0.03840319 -0.12108559 0.12566540
## interest_rate_spread 0.19095208 -0.43527376 -0.09312257 0.20650456
## volatility -0.10840653 -0.25898228 0.09032970 -0.06803291
## PC9 PC10 PC11 PC12
## dollar_vol 0.122574816 -0.126471175 0.10488372 -0.49327446
## RET -0.519738841 -0.427967258 0.20908091 -0.07839313
## SHROUT 0.100861012 -0.125656590 0.02720539 0.05305744
## SMB 0.331819854 0.393232105 0.04005209 -0.03203888
## HML 0.331017715 0.194500391 -0.12112016 0.21416706
## CAPEI -0.338750828 0.094398615 -0.50004402 -0.21135589
## bm -0.252836458 0.089392493 -0.52062344 0.00667326
## evm -0.156021532 0.038062416 0.26337587 -0.05400858
## pe_exi 0.179416238 -0.034213377 -0.46693841 -0.25293551
## dpr -0.307155516 0.641865591 0.24427754 -0.07667120
## npm 0.223769984 -0.135145687 0.10899305 -0.47460250
## roa -0.130066686 0.133797439 0.11589223 -0.01392507
## roe -0.006950409 -0.001973818 -0.03687582 0.21435281
## roce -0.262189975 0.253108030 0.02819838 0.05611067
## ptb -0.071769041 0.002810908 -0.01516842 0.13410173
## interest_rate_spread 0.044538823 0.178403574 0.16466222 -0.36738743
## volatility -0.111912060 0.170338854 -0.07278982 -0.39477123
## PC13 PC14 PC15 PC16
## dollar_vol 0.119623359 -0.56373111 -0.284317522 0.276109502
## RET -0.006987718 0.03278773 -0.028093495 0.077465259
## SHROUT 0.101353235 0.49698979 -0.282670435 -0.276157658
## SMB -0.024502409 -0.11430105 -0.006171665 0.005747384
## HML -0.013472365 -0.04056733 -0.036231837 0.028210973
## CAPEI -0.086074902 -0.04241015 -0.011378007 0.029622328
## bm -0.290979570 -0.13337706 -0.167770018 -0.139050456
## evm -0.253228040 -0.02768383 0.084217002 -0.060632841
## pe_exi 0.185951256 0.08039388 0.016728465 0.020839337
## dpr -0.062933612 0.04470534 -0.002786329 0.113778500
## npm -0.262904282 0.15056060 -0.034283540 -0.315634128
## roa 0.141094377 -0.36517626 0.355580478 -0.401075055
## roe -0.637695442 0.01458670 -0.009063363 0.181892751
## roce 0.404209377 0.13536522 -0.528781095 0.038546932
## ptb 0.020149905 -0.27876606 -0.167673497 -0.699345685
## interest_rate_spread -0.313049876 0.15079351 -0.344419160 -0.130997417
## volatility 0.149575729 0.33999825 0.497414472 -0.045830166
## PC17
## dollar_vol 0.121015045
## RET 0.026348723
## SHROUT 0.571674588
## SMB 0.021704151
## HML -0.007438289
## CAPEI 0.061143226
## bm 0.163769509
## evm -0.010006597
## pe_exi 0.027577317
## dpr 0.099176494
## npm -0.247494729
## roa 0.471130702
## roe -0.090716103
## roce -0.291033380
## ptb -0.398608354
## interest_rate_spread 0.154031470
## volatility -0.239385411
# Extract eigenvalues and variance explained
eigenvalues <- task5_pca$sdev^2
variance_explained <- eigenvalues / sum(eigenvalues)
cumulative_variance <- cumsum(variance_explained)
components <- 1:length(eigenvalues)
scree_data <- data.frame(Components = components, Eigenvalues = eigenvalues)
# Create screeplot
task5_screeplot <- ggplot(scree_data, aes(x = Components, y = Eigenvalues)) +
geom_line(color = "blue") +
geom_point(color = "blue", size = 2) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(title = "Scree Plot of Principal Components",
x = "Principal Components",
y = "Proportion of Variance Explained") +
theme_minimal() +
theme(
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
title = element_text(size = 14)) +
scale_x_continuous(breaks = components) +
annotate("text",
x = max(components),
y = 1,
label = "Kaiser Criterion",
color = "red",
hjust = 1.1,
vjust = -1) +
annotate("text",
x = 1,
y = max(eigenvalues),
label = "Eigenvalues",
color = "blue",
hjust = -0.2)
print(task5_screeplot)# Save the file for report
save_plot(task5_screeplot, "task5 - scree plot")
# Create the eigenvalues summary table
pca_result <- data.frame(
Principal_Component = 1:length(eigenvalues),
Eigenvalue = eigenvalues,
Variance_Explained = variance_explained,
Cumulative_Variance = cumulative_variance
)
print(pca_result)## Principal_Component Eigenvalue Variance_Explained Cumulative_Variance
## 1 1 5.32701796 0.313353998 0.3133540
## 2 2 2.40566351 0.141509618 0.4548636
## 3 3 1.53034174 0.090020103 0.5448837
## 4 4 1.40710145 0.082770674 0.6276544
## 5 5 1.26436039 0.074374141 0.7020285
## 6 6 1.08180109 0.063635359 0.7656639
## 7 7 0.89132849 0.052431087 0.8180950
## 8 8 0.73986263 0.043521331 0.8616163
## 9 9 0.57400350 0.033764912 0.8953812
## 10 10 0.48913178 0.028772458 0.9241537
## 11 11 0.35466017 0.020862363 0.9450160
## 12 12 0.34609445 0.020358497 0.9653745
## 13 13 0.25287876 0.014875221 0.9802498
## 14 14 0.15557532 0.009151489 0.9894013
## 15 15 0.08824877 0.005191104 0.9945924
## 16 16 0.06248348 0.003675499 0.9982679
## 17 17 0.02944649 0.001732147 1.0000000
Run a regression with the PCA result
# Only extract the significant PC to a new dataframe
pca_regression_df <- data.frame(task5_pca$x)[, 1:6]
# Replace the PCA's RET with the original dataset RET (the S&P500 stocks' average returns)
pca_regression_df$RET <- task5_dataset5$RET
# Add the market factor inside the dataset
pca_regression_df <- pca_regression_df %>%
bind_cols(Mkt = task5_dataset4$Mkt)
# Run the regression
pca_regression_model <- lm(RET ~ ., data = pca_regression_df)
summary(pca_regression_model)##
## Call:
## lm(formula = RET ~ ., data = pca_regression_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0266487 -0.0063438 0.0005195 0.0058295 0.0269589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0037602 0.0009034 4.162 5.99e-05 ***
## PC1 0.0019018 0.0003706 5.132 1.13e-06 ***
## PC2 -0.0052062 0.0006543 -7.957 1.16e-12 ***
## PC3 -0.0079547 0.0007346 -10.828 < 2e-16 ***
## PC4 -0.0071145 0.0007833 -9.083 2.77e-15 ***
## PC5 -0.0051697 0.0007604 -6.799 4.47e-10 ***
## PC6 0.0082613 0.0009339 8.846 1.00e-14 ***
## Mkt 0.1643814 0.0059577 27.591 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.009589 on 119 degrees of freedom
## Multiple R-squared: 0.9637, Adjusted R-squared: 0.9616
## F-statistic: 451.6 on 7 and 119 DF, p-value: < 2.2e-16
Create a plot to show the loadings of each variable
# Prepare the data
pca_loading_factor <- as.data.frame(task5_pca$rotation) %>%
select(c(PC1, PC2, PC3, PC4, PC5, PC6))
pca_loading_factor$Variable <- rownames(pca_loading_factor)
pca_loadings_df <- pivot_longer(pca_loading_factor %>%
filter(Variable != "RET"),
cols = -Variable,
names_to = "PC",
values_to = "Loading")
# Generate the plot - can observe in each PC, the variable is positive / negative, and the different factor representation in each PC
ggplot(pca_loadings_df,
aes(x = Variable,
y = Loading,
fill = PC)) +
geom_bar(stat = "identity",
position = "dodge") +
labs(title = "Loadings of each variable for significant PCs",
x = "Variables",
y = "Loadings")+
facet_wrap(~ PC) +
coord_flip() +
theme(axis.text.y = element_text(angle = 0)) 6 Difference-in-Difference Analysis
Data Preparation
# Load library
library(lme4)
library(plm)
# Import data
tick_pilot <- read.csv("Tick_Pilot_Test_Group.csv")
stock_1415 <- read.csv("DailyStocks_2014_2015.csv")
stock_1617 <- read.csv("DailyStocks_2016_2017.csv")
stock_1820 <- read.csv("DailyStocks_2018_2020.csv")
# Make sure all the "date" are in same format
tick_pilot$Effective_Date <- as.Date(tick_pilot$Effective_Date, format = "%Y-%m-%d")
stock_1415$date <- as.Date(stock_1415$date, format = "%Y-%m-%d")
stock_1617$date <- as.Date(stock_1617$date, format = "%Y-%m-%d")
stock_1820$date <- as.Date(stock_1820$date, format = "%Y-%m-%d")Inspect data
## The column name is the same: TRUE
## The column name is the same: TRUE
## The column name is the same: TRUE
## Differences in structure between stock_1415 and stock_1820: NUMTRD
## Differences in structure between stock_1617 and stock_1820: NUMTRD
## Since we will not use NUMTRD in this analysis, no need to action.
Short-term effect
Create dataset
The program effective date is 2016-10-01, consider the potential for initial volatility, allow the market a month to adjust, the start date of post period is chosen as November 1, 2016, and observe the short term effect within six months. For pre-period, the same dates in the previous year of post-period can better control for seasonal effects and other factors that may influence bid-ask spreads.
# Combine datasets
short_period <- stock_1415 %>%
bind_rows(stock_1617) %>%
filter(date >= "2015-11-01" & date < "2016-10-01" | date >= "2016-11-01" & date <= "2017-01-31") %>%
select(PERMNO, date, TICKER, BID, ASK) %>%
filter(TICKER %in% tick_pilot$Ticker_Symbol)
# Inspect dataset
head(short_period, n = 5)## PERMNO date TICKER BID ASK
## 1 10025 2015-11-02 AEPI 83.76 84.00
## 2 10025 2015-11-03 AEPI 80.13 80.41
## 3 10025 2015-11-04 AEPI 79.62 79.87
## 4 10025 2015-11-05 AEPI 79.35 79.51
## 5 10025 2015-11-06 AEPI 86.00 86.34
Data wrangling
# Clean data
short_period_df1 <- short_period %>%
mutate(
# Handle NA values
ASK = ifelse(ASK - BID < 0, NA, as.numeric(as.character(ASK))),
BID = ifelse(ASK - BID < 0, NA, as.numeric(as.character(BID)))
) %>%
# Remove duplicate
distinct(PERMNO, date, .keep_all = TRUE) %>%
group_by(PERMNO) %>%
# Compute relative bid-ask spread
mutate(
bid_ask_spread = (ASK - BID) / ((ASK + BID) / 2)
) %>%
ungroup() %>%
select(date, TICKER, bid_ask_spread) %>%
# Define group for each security
left_join(tick_pilot %>%
select(Ticker_Symbol, Tick_Size_Pilot_Program_Group), by = c("TICKER" = "Ticker_Symbol")) %>%
rename(Test_group_ = Tick_Size_Pilot_Program_Group) %>%
filter(!is.na(bid_ask_spread)) %>%
# Define period for pre and post period
mutate(
d_after = ifelse(date < "2016-10-01", 0, 1),
Test_group_ = factor(Test_group_)
)
# Inspect number of stocks for each group (Short period)
print(short_period_df1 %>%
group_by(Test_group_) %>%
summarise(Ticker_per_group = n_distinct(TICKER)))## # A tibble: 4 × 2
## Test_group_ Ticker_per_group
## <fct> <int>
## 1 C 959
## 2 G1 328
## 3 G2 326
## 4 G3 314
## # A tibble: 5 × 5
## date TICKER bid_ask_spread Test_group_ d_after
## <date> <chr> <dbl> <fct> <dbl>
## 1 2015-11-02 AEPI 0.00286 G2 0
## 2 2015-11-03 AEPI 0.00349 G2 0
## 3 2015-11-04 AEPI 0.00313 G2 0
## 4 2015-11-05 AEPI 0.00201 G2 0
## 5 2015-11-06 AEPI 0.00395 G2 0
Descriptive Statistics
# Observe the average and standard deviation bid-ask spread value for each group
print(short_period_df1 %>%
group_by(Test_group_) %>%
summarise(
mean_spread = mean(bid_ask_spread, na.rm = TRUE),
sd_spread = sd(bid_ask_spread, na.rm = TRUE)
))## # A tibble: 4 × 3
## Test_group_ mean_spread sd_spread
## <fct> <dbl> <dbl>
## 1 C 0.00692 0.0222
## 2 G1 0.00680 0.0200
## 3 G2 0.00754 0.0205
## 4 G3 0.00706 0.0240
DiD regressions
# Run regression with fixed effect
DiD_model_1 <- plm(bid_ask_spread ~ Test_group_ * d_after,
data = short_period_df1,
index = c("date", "TICKER"),
model = "within")
summary(DiD_model_1)## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = bid_ask_spread ~ Test_group_ * d_after, data = short_period_df1,
## model = "within", index = c("date", "TICKER"))
##
## Unbalanced Panel: n = 293, T = 1886-1926, N = 559863
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.0117571 -0.0062784 -0.0051638 -0.0023067 1.9891584
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## Test_group_G1 -6.4777e-04 9.2381e-05 -7.0120 2.352e-12 ***
## Test_group_G2 -7.1589e-05 9.2209e-05 -0.7764 0.4375
## Test_group_G3 -4.8118e-04 9.3995e-05 -5.1192 3.069e-07 ***
## Test_group_G1:d_after 2.5343e-03 2.0112e-04 12.6013 < 2.2e-16 ***
## Test_group_G2:d_after 3.3135e-03 2.0120e-04 16.4692 < 2.2e-16 ***
## Test_group_G3:d_after 2.9495e-03 2.0415e-04 14.4478 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 267.31
## Residual Sum of Squares: 267.07
## R-Squared: 0.00089239
## Adj. R-Squared: 0.00036031
## F-statistic: 83.299 on 6 and 559564 DF, p-value: < 2.22e-16
# Determine the impact of fixed effect
DiD_model_1_random <- plm(bid_ask_spread ~ Test_group_ * d_after,
data = short_period_df1,
index = c("date", "TICKER"),
model = "random")
summary(DiD_model_1_random)## Oneway (individual) effect Random Effect Model
## (Swamy-Arora's transformation)
##
## Call:
## plm(formula = bid_ask_spread ~ Test_group_ * d_after, data = short_period_df1,
## model = "random", index = c("date", "TICKER"))
##
## Unbalanced Panel: n = 293, T = 1886-1926, N = 559863
##
## Effects:
## var std.dev share
## idiosyncratic 4.773e-04 2.185e-02 1
## individual 5.944e-08 2.438e-04 0
## theta:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1001 0.1009 0.1012 0.1012 0.1017 0.1019
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.00957 -0.00622 -0.00551 0.00000 -0.00242 1.99047
##
## Coefficients:
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept) 7.0968e-03 4.9307e-05 143.9319 < 2.2e-16 ***
## Test_group_G1 -6.4980e-04 9.2423e-05 -7.0306 2.056e-12 ***
## Test_group_G2 -7.0419e-05 9.2251e-05 -0.7633 0.4453
## Test_group_G3 -4.8404e-04 9.4038e-05 -5.1473 2.643e-07 ***
## d_after -8.4170e-04 1.0742e-04 -7.8359 4.657e-15 ***
## Test_group_G1:d_after 2.5375e-03 2.0121e-04 12.6111 < 2.2e-16 ***
## Test_group_G2:d_after 3.3125e-03 2.0129e-04 16.4565 < 2.2e-16 ***
## Test_group_G3:d_after 2.9517e-03 2.0425e-04 14.4517 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 267.72
## Residual Sum of Squares: 267.45
## R-Squared: 0.0010111
## Adj. R-Squared: 0.00099864
## Chisq: 563.244 on 7 DF, p-value: < 2.22e-16
The result is quite similar, but not exactly the same, suggesting that fixed effect is somehow influencing, it is better to use the model with fixed effect
Long-term effect
The program effective for two years. Observe the long-term effect:
# To compute volatility for specific period, separate into two dataset first
# The pre-period should be the same as the short-term effect (it is the baseline, should be consistent)
pre_treatment <- stock_1415 %>%
bind_rows(stock_1617) %>%
filter(date >= "2015-10-01" & date <= "2016-09-30") %>%
select(PERMNO, date, TICKER, PRC, VOL, RET, BID, ASK, SHROUT) %>%
filter(TICKER %in% tick_pilot$Ticker_Symbol)
# Using the first effective month to compute the rolling standard deviation, therefore, the post-period actually is from November 1, 2016
post_treatment <- stock_1617 %>%
bind_rows(stock_1820) %>%
filter(date >= "2016-10-01" & date <= "2018-09-30") %>%
select(PERMNO, date, TICKER, PRC, VOL, RET, BID, ASK, SHROUT) %>%
filter(TICKER %in% tick_pilot$Ticker_Symbol)Data wrangling
# Define trading days (for calculate volatility) - typically trading day is 252, this code also make sure the number of observation for the data
trading_days_pre <- as.integer(length(unique(pre_treatment$date)) / 2)
trading_days_post <- as.integer(length(unique(post_treatment$date)) / 2)
# Clean data (pre-period)
pre_treatment_df1 <- pre_treatment %>%
mutate(
PERMNO = as.character(PERMNO),
PRC = abs(PRC),
# Handle missing values
PRC = ifelse(PRC == 0.0, NA, PRC),
VOL = ifelse(VOL == -99, NA, VOL),
RET = case_when(
RET %in% c(-44, -55, -66, -77, -88, -99) ~ NA,
RET %in% c(".A", ".B", ".C", ".D") ~ NA,
TRUE ~ as.numeric(as.character(RET))
),
ASK = ifelse(ASK - BID < 0, NA, as.numeric(as.character(ASK))),
BID = ifelse(ASK - BID < 0, NA, as.numeric(as.character(BID)))
) %>%
# Remove duplicate
distinct(PERMNO, date, .keep_all = TRUE) %>%
arrange(PERMNO, date) %>%
group_by(PERMNO) %>%
# Compute relevant variables
mutate(
mean_ret = rollmean(RET, k = 30, fill = NA, align = "right", na.rm = TRUE),
std_ret = rollapply(RET, width = 30, FUN = sd, fill = NA, align = "right", na.rm = TRUE),
volatility = std_ret * sqrt(trading_days_pre),
dollar_vol = PRC * VOL,
market_cap = SHROUT * PRC * 1000,
bid_ask_spread = (ASK - BID) / ((ASK + BID) / 2)
) %>%
ungroup() %>%
select(PERMNO, date, TICKER, bid_ask_spread, volatility, dollar_vol, market_cap)
# Clean data (post-period)
post_treatment_df1 <- post_treatment %>%
mutate(
PERMNO = as.character(PERMNO),
PRC = abs(PRC),
# Handle missing values
PRC = ifelse(PRC == 0.0, NA, PRC),
VOL = ifelse(VOL == -99, NA, VOL),
RET = case_when(
RET %in% c(-44, -55, -66, -77, -88, -99) ~ NA,
RET %in% c(".A", ".B", ".C", ".D") ~ NA,
TRUE ~ as.numeric(as.character(RET))
),
ASK = ifelse(ASK - BID < 0, NA, as.numeric(as.character(ASK))),
BID = ifelse(ASK - BID < 0, NA, as.numeric(as.character(BID)))
) %>%
# Remove duplicate
distinct(PERMNO, date, .keep_all = TRUE) %>%
group_by(PERMNO) %>%
# Compute relevant variables
mutate(
mean_ret = rollmean(RET, k = 30, fill = NA, align = "right", na.rm = TRUE),
std_ret = rollapply(RET, width = 30, FUN = sd, fill = NA, align = "right", na.rm = TRUE),
volatility = std_ret * sqrt(trading_days_post),
dollar_vol = PRC * VOL,
market_cap = SHROUT * PRC * 1000,
bid_ask_spread = (ASK - BID) / ((ASK + BID) / 2)
) %>%
ungroup() %>%
select(PERMNO, date, TICKER, bid_ask_spread, volatility, dollar_vol, market_cap)
# Combined dataset and change the column name (it will look clearer in the regression coefficients)
task6_panel <- pre_treatment_df1 %>% bind_rows(post_treatment_df1) %>%
left_join(tick_pilot %>%
select(Ticker_Symbol, Tick_Size_Pilot_Program_Group), by = c("TICKER" = "Ticker_Symbol")) %>%
rename(Test_group_ = Tick_Size_Pilot_Program_Group)
# Remove missing values, create period binary variable, and standardise variables to make sure they are on the same scale
task6_panel_cleaned <- task6_panel %>%
filter(!is.na(bid_ask_spread) & !is.na(volatility))%>%
mutate(
d_after = ifelse(date < "2016-10-01", 0, 1),
scaled_spread = scale(bid_ask_spread),
scaled_volatility = scale(volatility),
scaled_dollar_vol = scale(dollar_vol),
scaled_market_cap = scale(market_cap),
Test_group_ = factor(Test_group_)
)
# Inspect number of stocks for each group
print(task6_panel_cleaned %>%
group_by(Test_group_) %>%
summarise(Ticker_per_group = n_distinct(TICKER)))## # A tibble: 4 × 2
## Test_group_ Ticker_per_group
## <fct> <int>
## 1 C 965
## 2 G1 330
## 3 G2 326
## 4 G3 315
Descriptive Statistics
# Observe the average value for each group
print(task6_panel %>%
group_by(Test_group_) %>%
summarise(
mean_spread = mean(bid_ask_spread, na.rm = TRUE),
sd_spread = sd(bid_ask_spread, na.rm = TRUE)
))## # A tibble: 4 × 3
## Test_group_ mean_spread sd_spread
## <chr> <dbl> <dbl>
## 1 C 0.00621 0.0201
## 2 G1 0.00698 0.0171
## 3 G2 0.00823 0.0226
## 4 G3 0.00745 0.0197
# Visualise the average bid-ask spread across the entire period
task6_time_series <- ggplot(task6_panel,
aes(x = date,
y = bid_ask_spread,
color = as.factor(Test_group_))) +
geom_line(stat = "summary",
fun = "mean",
show.legend = FALSE) +
labs(title = "Overall Trends in Bid-Ask Spread",
subtitle = "(Program Began on October 2016)",
x = "Date",
y = "Average Relative Bid-Ask Spread") +
scale_x_date(breaks = seq(as.Date("2015-11-01"),
as.Date("2019-11-01"),
by = "6 months"),
labels = date_format("%Y-%m-%d")) +
# Increase readiness
facet_wrap(~ as.factor(Test_group_), scales = "free",
labeller = as_labeller(c("C" = "Control Group",
"G1" = "Test Group One",
"G2" = "Test Group Two",
"G3" = "Test Group Three"))) +
# Add a line to underlying the effective date
geom_vline(xintercept = as.Date("2016-10-01"), linetype = "dashed", color = "black") +
theme(axis.text.x = element_text(angle = 55, hjust=1))
print(task6_time_series)DiD regressions
# Run regression with fixed effect
DiD_model_2 <- plm(bid_ask_spread ~ Test_group_ * d_after,
data = task6_panel_cleaned,
index = c("date", "PERMNO"),
model = "within")
summary(DiD_model_2)## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = bid_ask_spread ~ Test_group_ * d_after, data = task6_panel_cleaned,
## model = "within", index = c("date", "PERMNO"))
##
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.0108503 -0.0058366 -0.0047848 -0.0016908 1.9891194
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## Test_group_G1 -5.9440e-04 8.4094e-05 -7.0682 1.570e-12 ***
## Test_group_G2 -3.2642e-05 8.3894e-05 -0.3891 0.6972
## Test_group_G3 -4.8288e-04 8.5592e-05 -5.6417 1.685e-08 ***
## Test_group_G1:d_after 2.2008e-03 1.0328e-04 21.3096 < 2.2e-16 ***
## Test_group_G2:d_after 3.2146e-03 1.0324e-04 31.1371 < 2.2e-16 ***
## Test_group_G3:d_after 2.8112e-03 1.0511e-04 26.7463 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 484.64
## Residual Sum of Squares: 483.26
## R-Squared: 0.0028472
## Adj. R-Squared: 0.0022933
## F-statistic: 601.398 on 6 and 1263725 DF, p-value: < 2.22e-16
# Observe the importance of fixed effect
DiD_model_2_random <- plm(bid_ask_spread ~ Test_group_ * d_after,
data = task6_panel_cleaned,
index = c("date", "TICKER"),
model = "random")
summary(DiD_model_2_random)## Oneway (individual) effect Random Effect Model
## (Swamy-Arora's transformation)
##
## Call:
## plm(formula = bid_ask_spread ~ Test_group_ * d_after, data = task6_panel_cleaned,
## model = "random", index = c("date", "TICKER"))
##
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
##
## Effects:
## var std.dev share
## idiosyncratic 3.824e-04 1.956e-02 1
## individual 8.410e-08 2.900e-04 0
## theta:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1431 0.1487 0.1570 0.1548 0.1606 0.1617
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.00894 -0.00589 -0.00495 0.00000 -0.00173 1.99038
##
## Coefficients:
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept) 7.0332e-03 4.6635e-05 150.8119 < 2.2e-16 ***
## Test_group_G1 -5.9537e-04 8.4109e-05 -7.0785 1.457e-12 ***
## Test_group_G2 -3.2452e-05 8.3908e-05 -0.3868 0.6989
## Test_group_G3 -4.8589e-04 8.5607e-05 -5.6759 1.380e-08 ***
## d_after -1.4000e-03 5.7202e-05 -24.4750 < 2.2e-16 ***
## Test_group_G1:d_after 2.2015e-03 1.0330e-04 21.3120 < 2.2e-16 ***
## Test_group_G2:d_after 3.2150e-03 1.0326e-04 31.1352 < 2.2e-16 ***
## Test_group_G3:d_after 2.8152e-03 1.0512e-04 26.7803 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 485.07
## Residual Sum of Squares: 483.69
## R-Squared: 0.0028302
## Adj. R-Squared: 0.0028247
## Chisq: 3609.49 on 7 DF, p-value: < 2.22e-16
Other factors that may influence bid-ask spreads
# Interact with volatility
DiD_model_3 <- plm(scaled_spread ~ Test_group_ * d_after + Test_group_ * d_after * scaled_volatility,
data = task6_panel_cleaned,
index = c("date", "TICKER"),
model = "within")
summary(DiD_model_3)## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = scaled_spread ~ Test_group_ * d_after + Test_group_ *
## d_after * scaled_volatility, data = task6_panel_cleaned,
## model = "within", index = c("date", "TICKER"))
##
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -11.300807 -0.276644 -0.202029 -0.067455 101.056980
##
## Coefficients:
## Estimate Std. Error t-value
## Test_group_G1 -0.0129976 0.0043730 -2.9722
## Test_group_G2 0.0033957 0.0043194 0.7861
## Test_group_G3 0.0272294 0.0044381 6.1354
## scaled_volatility 0.1910490 0.0021761 87.7937
## Test_group_G1:d_after 0.0904628 0.0053047 17.0533
## Test_group_G2:d_after 0.1292625 0.0052730 24.5141
## Test_group_G3:d_after 0.0847384 0.0053896 15.7226
## Test_group_G1:scaled_volatility 0.0449286 0.0059994 7.4888
## Test_group_G2:scaled_volatility 0.0251111 0.0061947 4.0536
## Test_group_G3:scaled_volatility 0.2047588 0.0057873 35.3805
## d_after:scaled_volatility -0.0658201 0.0026826 -24.5358
## Test_group_G1:d_after:scaled_volatility 0.0540895 0.0066556 8.1270
## Test_group_G2:d_after:scaled_volatility 0.0887940 0.0066382 13.3763
## Test_group_G3:d_after:scaled_volatility -0.0740496 0.0067536 -10.9644
## Pr(>|t|)
## Test_group_G1 0.002956 **
## Test_group_G2 0.431784
## Test_group_G3 8.498e-10 ***
## scaled_volatility < 2.2e-16 ***
## Test_group_G1:d_after < 2.2e-16 ***
## Test_group_G2:d_after < 2.2e-16 ***
## Test_group_G3:d_after < 2.2e-16 ***
## Test_group_G1:scaled_volatility 6.953e-14 ***
## Test_group_G2:scaled_volatility 5.044e-05 ***
## Test_group_G3:scaled_volatility < 2.2e-16 ***
## d_after:scaled_volatility < 2.2e-16 ***
## Test_group_G1:d_after:scaled_volatility 4.405e-16 ***
## Test_group_G2:d_after:scaled_volatility < 2.2e-16 ***
## Test_group_G3:d_after:scaled_volatility < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 1262800
## Residual Sum of Squares: 1206500
## R-Squared: 0.044616
## Adj. R-Squared: 0.04408
## F-statistic: 4215.4 on 14 and 1263717 DF, p-value: < 2.22e-16
# Interact with dollar volume
DiD_model_4 <- plm(scaled_spread ~ Test_group_ * d_after + Test_group_ * d_after * scaled_dollar_vol,
data = task6_panel_cleaned,
index = c("date", "TICKER"),
model = "within")
summary(DiD_model_4)## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = scaled_spread ~ Test_group_ * d_after + Test_group_ *
## d_after * scaled_dollar_vol, data = task6_panel_cleaned,
## model = "within", index = c("date", "TICKER"))
##
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.567811 -0.294686 -0.236157 -0.076207 101.490316
##
## Coefficients:
## Estimate Std. Error t-value
## Test_group_G1 -0.02784746 0.00430939 -6.4620
## Test_group_G2 0.00013458 0.00430455 0.0313
## Test_group_G3 -0.01329252 0.00437034 -3.0415
## scaled_dollar_vol -0.23570473 0.00351861 -66.9881
## Test_group_G1:d_after 0.11136970 0.00527469 21.1140
## Test_group_G2:d_after 0.16319129 0.00527620 30.9297
## Test_group_G3:d_after 0.13401777 0.00535500 25.0267
## Test_group_G1:scaled_dollar_vol -0.01669069 0.00725472 -2.3007
## Test_group_G2:scaled_dollar_vol 0.01435802 0.00677809 2.1183
## Test_group_G3:scaled_dollar_vol 0.08083983 0.00613116 13.1851
## d_after:scaled_dollar_vol 0.16087000 0.00374039 43.0089
## Test_group_G1:d_after:scaled_dollar_vol -0.01403663 0.00778018 -1.8042
## Test_group_G2:d_after:scaled_dollar_vol -0.08149131 0.00737161 -11.0547
## Test_group_G3:d_after:scaled_dollar_vol -0.11238667 0.00669849 -16.7779
## Pr(>|t|)
## Test_group_G1 1.033e-10 ***
## Test_group_G2 0.975058
## Test_group_G3 0.002354 **
## scaled_dollar_vol < 2.2e-16 ***
## Test_group_G1:d_after < 2.2e-16 ***
## Test_group_G2:d_after < 2.2e-16 ***
## Test_group_G3:d_after < 2.2e-16 ***
## Test_group_G1:scaled_dollar_vol 0.021411 *
## Test_group_G2:scaled_dollar_vol 0.034150 *
## Test_group_G3:scaled_dollar_vol < 2.2e-16 ***
## d_after:scaled_dollar_vol < 2.2e-16 ***
## Test_group_G1:d_after:scaled_dollar_vol 0.071208 .
## Test_group_G2:d_after:scaled_dollar_vol < 2.2e-16 ***
## Test_group_G3:d_after:scaled_dollar_vol < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 1262800
## Residual Sum of Squares: 1241000
## R-Squared: 0.017273
## Adj. R-Squared: 0.016721
## F-statistic: 1586.53 on 14 and 1263717 DF, p-value: < 2.22e-16
# Interact with market capitalisation
DiD_model_5 <- plm(scaled_spread ~ Test_group_ * d_after + Test_group_ * d_after * scaled_market_cap,
data = task6_panel_cleaned,
index = c("date", "TICKER"),
model = "within")
summary(DiD_model_5)## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = scaled_spread ~ Test_group_ * d_after + Test_group_ *
## d_after * scaled_market_cap, data = task6_panel_cleaned,
## model = "within", index = c("date", "TICKER"))
##
## Unbalanced Panel: n = 697, T = 1645-1923, N = 1264428
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.757122 -0.326744 -0.212696 0.028823 101.366819
##
## Coefficients:
## Estimate Std. Error t-value
## Test_group_G1 -0.0118479 0.0042641 -2.7785
## Test_group_G2 -0.0200371 0.0043489 -4.6074
## Test_group_G3 -0.0139776 0.0043616 -3.2047
## scaled_market_cap -0.3590842 0.0029151 -123.1815
## Test_group_G1:d_after 0.1031756 0.0052069 19.8152
## Test_group_G2:d_after 0.1810066 0.0052766 34.3036
## Test_group_G3:d_after 0.1426427 0.0053156 26.8348
## Test_group_G1:scaled_market_cap 0.0278860 0.0056555 4.9308
## Test_group_G2:scaled_market_cap -0.0322744 0.0060101 -5.3700
## Test_group_G3:scaled_market_cap 0.0156337 0.0059385 2.6326
## d_after:scaled_market_cap 0.1766160 0.0032151 54.9327
## Test_group_G1:d_after:scaled_market_cap -0.0585878 0.0062825 -9.3256
## Test_group_G2:d_after:scaled_market_cap -0.0425621 0.0066070 -6.4419
## Test_group_G3:d_after:scaled_market_cap -0.0529140 0.0065102 -8.1278
## Pr(>|t|)
## Test_group_G1 0.005461 **
## Test_group_G2 4.077e-06 ***
## Test_group_G3 0.001352 **
## scaled_market_cap < 2.2e-16 ***
## Test_group_G1:d_after < 2.2e-16 ***
## Test_group_G2:d_after < 2.2e-16 ***
## Test_group_G3:d_after < 2.2e-16 ***
## Test_group_G1:scaled_market_cap 8.192e-07 ***
## Test_group_G2:scaled_market_cap 7.873e-08 ***
## Test_group_G3:scaled_market_cap 0.008474 **
## d_after:scaled_market_cap < 2.2e-16 ***
## Test_group_G1:d_after:scaled_market_cap < 2.2e-16 ***
## Test_group_G2:d_after:scaled_market_cap 1.180e-10 ***
## Test_group_G3:d_after:scaled_market_cap 4.375e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 1262800
## Residual Sum of Squares: 1187700
## R-Squared: 0.059535
## Adj. R-Squared: 0.059007
## F-statistic: 5714.19 on 14 and 1263717 DF, p-value: < 2.22e-16